Switch to requests, tidy up README, add requirements.txt

This commit is contained in:
Xerbo 2020-07-17 18:47:24 +01:00
parent c87a1f5355
commit 071e8692ad
3 changed files with 52 additions and 35 deletions

View file

@ -1,38 +1,44 @@
This branch is the development version of furaffinity-dl rewritten in python.
# FurAffinity Downloader
**furaffinity-dl** was a bash script (now in python) for batch downloading of galleries (and scraps/favourites) from furaffinity users.
**furaffinity-dl** is a python script for batch downloading of galleries (and scraps/favourites) from furaffinity users.
It was written for preservation of culture, to counter the people nuking their galleries every once in a while.
Supports all known submission types: images, texts and audio.
Supports all known submission types: images, text, flash and audio.
## Requirements
Exact requirements are unknown since its still in development, but you should only need `beautifulsoup4` to be installed (`pip3 install beautifulsoup4`). I will put a `requirements.txt` file in the repo soon.
`pip3 install -r requirements.txt`
**The script currently only works with the "Modern" theme**
furaffinity-dl has only been tested only on Linux, however it should also work on Mac, Windows and any other platform that supports python.
furaffinity-dl has only been tested on Linux, however it should also work on Mac, Windows or any other platform that supports python.
## Usage
Run it with
`./furaffinity-dl.py category username`
Run it with:
`./furaffinity-dl.py category username`
or:
`python3 furaffinity-dl.py category username`
`python3 furaffinity-dl.py category username`
All files from the given section and user will be downloaded to the current directory.
### Examples
`python3 furaffinity-dl.py gallery koul`
`python3 furaffinity-dl.py -o koulsArt gallery koul`
`python3 furaffinity-dl.py gallery koul`
`python3 furaffinity-dl.py -o mylasFavs favorites mylafox`
`python3 furaffinity-dl.py -o koulsArt gallery koul`
`python3 furaffinity-dl.py -o mylasFavs favorites mylafox`
For a full list of command line arguments use `./furaffinity-dl -h`.
You can also log in to download restricted content. To do that, log in to FurAffinity in your web browser, export cookies to a file from your web browser in Netscape format (there are extensions to do that [for Firefox](https://addons.mozilla.org/en-US/firefox/addon/ganbo/) and [for Chrome base browsers](https://chrome.google.com/webstore/detail/cookiestxt/njabckikapfpffapmjgojcnbfjonfjfg)), you can then pass them to the script with the `-c` flag, like this (you may also have to provide your user-agent):
You can also log in to download restricted content. To do that, log in to FurAffinity in your web browser, export cookies to a file from your web browser in Netscape format (there are extensions to do that [for Firefox](https://addons.mozilla.org/en-US/firefox/addon/ganbo/) and [for Chrome based browsers](https://chrome.google.com/webstore/detail/cookiestxt/njabckikapfpffapmjgojcnbfjonfjfg)), you can then pass them to the script with the `-c` flag, like this (you may also have to provide your user agent):
`python3 furaffinity-dl.py -c cookies.txt -u 'Mozilla/5.0 ....' gallery letodoesart`
`python3 furaffinity-dl.py -c cookies.txt -u 'Mozilla/5.0 ....' gallery letodoesart`
## TODO
@ -41,4 +47,5 @@ You can also log in to download restricted content. To do that, log in to FurAff
- Login without having to export cookies
## Disclaimer
It is your own responsibility to check whether batch downloading is allowed by FurAffinity's terms of service and to abide by them. For further disclaimers see LICENSE.

View file

@ -1,12 +1,11 @@
#!/usr/bin/python3
import argparse
from tqdm import tqdm
from argparse import RawTextHelpFormatter
import json
from bs4 import BeautifulSoup
import requests
import urllib.request
import http.cookiejar as cookielib
import urllib.parse
import re
import os
@ -19,7 +18,6 @@ current ideas / things to do:
metadata injection (gets messy easily)
sqlite database
support for classic theme
using `requests` instead of `urllib`
turn this into a module
'''
@ -61,7 +59,7 @@ if bool(re.compile(r'[^a-zA-Z0-9\-~._]').search(args.username)):
raise Exception('Username contains non-valid characters', args.username)
# Initialise a session
session = requests.Session()
session = requests.session()
session.headers.update({'User-Agent': args.ua})
# Load cookies from a netscape cookie file (if provided)
@ -74,9 +72,28 @@ base_url = 'https://www.furaffinity.net'
gallery_url = '{}/{}/{}'.format(base_url, args.category, args.username)
page_num = args.start
def download_file(url, fname, desc):
    """Stream the resource at *url* into the file *fname*.

    Shows a tqdm progress bar labelled with *desc* (padded/truncated to 40
    characters so bars line up). Returns True on success, or False when the
    server answers with any non-200 status (the download is skipped).
    """
    response = session.get(url, stream=True)
    if response.status_code != 200:
        print("Got a HTTP {} while downloading; skipping".format(response.status_code))
        return False

    # Content-Length may be absent; tqdm accepts total=0 and just shows a counter.
    length = int(response.headers.get('Content-Length', 0))
    progress = tqdm(
        desc=desc.ljust(40)[:40],
        total=length,
        miniters=100,
        unit='b',
        unit_scale=True,
        unit_divisor=1024,
    )
    with progress, open(fname, 'wb') as out:
        # 1 KiB chunks; write() returns the byte count, which drives the bar.
        for chunk in response.iter_content(chunk_size=1024):
            progress.update(out.write(chunk))
    return True
# The cursed function that handles downloading
def download_file(path):
def download(path):
page_url = '{}{}'.format(base_url, path)
response = session.get(page_url)
s = BeautifulSoup(response.text, 'html.parser')
@ -111,7 +128,7 @@ def download_file(path):
temp_ele = comment.find(class_='comment-parent')
parent_cid = None if temp_ele is None else int(temp_ele.attrs.get('href')[5:])
# Comment deleted or hidden
# Comment is deleted or hidden
if comment.find(class_='comment-link') is None:
continue
@ -123,21 +140,11 @@ def download_file(path):
'date': comment.find(class_='popup_date').attrs.get('title')
})
print('Downloading "{}"... '.format(title))
# Because for some god forsaken reason FA keeps the original filename in the upload, in the case that it contains non-ASCII
# characters it can make this thing blow up. So we have to do some annoying IRI stuff to make it work. Maybe consider `requests`
# instead of `urllib`
def strip_non_ascii(s): return ''.join(i for i in s if ord(i) < 128)
url = 'https:{}'.format(image)
url = urllib.parse.urlsplit(url)
url = list(url)
url[2] = urllib.parse.quote(url[2])
url = urllib.parse.urlunsplit(url)
try:
urllib.request.urlretrieve(url, os.path.join(args.output, strip_non_ascii(filename)))
except urllib.error.HTTPError:
print("404 Not Found, skipping")
url ='https:{}'.format(image)
output_path = os.path.join(args.output, filename)
if not download_file(url, output_path, data["title"]):
return False
# Write a UTF-8 encoded JSON file for metadata
with open(os.path.join(args.output, '{}.json'.format(filename)), 'w', encoding='utf-8') as f:
@ -156,7 +163,7 @@ while True:
account_username = s.find(class_='loggedin_user_avatar').attrs.get('alt')
print('Logged in as', account_username)
else:
print('Not logged in, some users gallery\'s may be unaccessible and NSFW content is not downloadable')
print('Not logged in, NSFW content is unaccessible')
# System messages
if s.find(class_='notice-message') is not None:
@ -174,7 +181,7 @@ while True:
# Download all images on the page
for img in s.findAll('figure'):
download_file(img.find('a').attrs.get('href'))
download(img.find('a').attrs.get('href'))
page_num += 1
print('Downloading page', page_num)

3
requirements.txt Normal file
View file

@ -0,0 +1,3 @@
beautifulsoup4
requests
tqdm