mirror of
https://github.com/Radiquum/furaffinity-dl.git
synced 2025-04-05 15:54:38 +00:00
211 lines
8.2 KiB
Python
Executable file
211 lines
8.2 KiB
Python
Executable file
#!/usr/bin/python3

import argparse
import http.cookiejar as cookielib
import json
import os
import re
import sys
from argparse import RawTextHelpFormatter
from time import sleep

import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

|
'''
|
|
Please refer to LICENSE for licensing conditions.
|
|
'''
|
|
|
|
# Argument parsing
|
|
parser = argparse.ArgumentParser(formatter_class=RawTextHelpFormatter, description='Downloads the entire gallery/scraps/favorites of a furaffinity user', epilog='''
|
|
Examples:
|
|
python3 furaffinity-dl.py gallery koul
|
|
python3 furaffinity-dl.py -o koulsArt gallery koul
|
|
python3 furaffinity-dl.py -o mylasFavs favorites mylafox\n
|
|
You can also log in to FurAffinity in a web browser and load cookies to download restricted content:
|
|
python3 furaffinity-dl.py -c cookies.txt gallery letodoesart\n
|
|
DISCLAIMER: It is your own responsibility to check whether batch downloading is allowed by FurAffinity terms of service and to abide by them.
|
|
''')
|
|
parser.add_argument('category', metavar='category', type=str, nargs='?', default='gallery', help='the category to download, gallery/scraps/favorites')
|
|
parser.add_argument('username', metavar='username', type=str, nargs='?', help='username of the furaffinity user')
|
|
parser.add_argument('--output', '-o', dest='output', type=str, default='.', help="output directory")
|
|
parser.add_argument('--cookies', '-c', dest='cookies', type=str, default='', help="path to a NetScape cookies file")
|
|
parser.add_argument('--ua', '-u', dest='ua', type=str, default='Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:68.7) Gecko/20100101 Firefox/68.7', help="Your browser's useragent, may be required, depending on your luck")
|
|
parser.add_argument('--start', '-s', dest='start', type=str, default=1, help="page number to start from")
|
|
parser.add_argument('--dont-redownload', '-d', const='dont_redownload', action='store_const', help="Don't redownload files that have already been downloaded")
|
|
parser.add_argument('--interval', '-i', dest='interval', type=float, default=0, help="delay between downloading pages")
|
|
parser.add_argument('--metadir', '-m', dest='metadir', type=str, default=None, help="directory to put meta files in")
|
|
|
|
args = parser.parse_args()
|
|
if args.username is None:
|
|
parser.print_help()
|
|
exit()
|
|
|
|
# Create output directory if it doesn't exist
|
|
if args.output != '.':
|
|
os.makedirs(args.output, exist_ok=True)
|
|
|
|
if args.metadir == None:
|
|
args.metadir = args.output
|
|
else:
|
|
os.makedirs(args.metadir, exist_ok=True)
|
|
|
|
|
|
# Check validity of category
|
|
valid_categories = ['gallery', 'favorites', 'scraps']
|
|
if args.category not in valid_categories:
|
|
raise Exception('Category is not valid', args.category)
|
|
|
|
# Check validity of username
|
|
if bool(re.compile(r'[^a-zA-Z0-9\-~._]').search(args.username)):
|
|
raise Exception('Username contains non-valid characters', args.username)
|
|
|
|
# Initialise a session
|
|
session = requests.session()
|
|
session.headers.update({'User-Agent': args.ua})
|
|
|
|
# Load cookies from a netscape cookie file (if provided)
|
|
if args.cookies != '':
|
|
cookies = cookielib.MozillaCookieJar(args.cookies)
|
|
cookies.load()
|
|
session.cookies = cookies
|
|
|
|
base_url = 'https://www.furaffinity.net'
|
|
gallery_url = '{}/{}/{}'.format(base_url, args.category, args.username)
|
|
page_num = args.start
|
|
|
|
|
|
def download_file(url, fname, desc):
|
|
r = session.get(url, stream=True)
|
|
if r.status_code != 200:
|
|
print("Got a HTTP {} while downloading; skipping".format(r.status_code))
|
|
return False
|
|
|
|
total = int(r.headers.get('Content-Length', 0))
|
|
with open(fname, 'wb') as file, tqdm(
|
|
desc=desc.ljust(40)[:40],
|
|
total=total,
|
|
miniters=100,
|
|
unit='b',
|
|
unit_scale=True,
|
|
unit_divisor=1024
|
|
) as bar:
|
|
for data in r.iter_content(chunk_size=1024):
|
|
size = file.write(data)
|
|
bar.update(size)
|
|
return True
|
|
|
|
|
|
# The cursed function that handles downloading
|
|
def download(path):
|
|
page_url = '{}{}'.format(base_url, path)
|
|
response = session.get(page_url)
|
|
s = BeautifulSoup(response.text, 'html.parser')
|
|
|
|
# System messages
|
|
if s.find(class_='notice-message') is not None:
|
|
message = s.find(class_='notice-message').find('div').find(class_="link-override").text.strip()
|
|
raise Exception('System Message', message)
|
|
|
|
image = s.find(class_='download').find('a').attrs.get('href')
|
|
title = s.find(class_='submission-title').find('p').contents[0]
|
|
filename = image.split("/")[-1:][0]
|
|
data = {
|
|
'id': int(path.split('/')[-2:-1][0]),
|
|
'filename': filename,
|
|
'author': s.find(class_='submission-id-sub-container').find('a').find('strong').text,
|
|
'date': s.find(class_='popup_date').attrs.get('title'),
|
|
'title': title,
|
|
'description': s.find(class_='submission-description').text.strip().replace('\r\n', '\n'),
|
|
"tags": [],
|
|
'category': s.find(class_='info').find(class_='category-name').text,
|
|
'type': s.find(class_='info').find(class_='type-name').text,
|
|
'species': s.find(class_='info').findAll('div')[2].find('span').text,
|
|
'gender': s.find(class_='info').findAll('div')[3].find('span').text,
|
|
'views': int(s.find(class_='views').find(class_='font-large').text),
|
|
'favorites': int(s.find(class_='favorites').find(class_='font-large').text),
|
|
'rating': s.find(class_='rating-box').text.strip(),
|
|
'comments': []
|
|
}
|
|
|
|
# Extact tags
|
|
for tag in s.find(class_='tags-row').findAll(class_='tags'):
|
|
data['tags'].append(tag.find('a').text)
|
|
|
|
# Extract comments
|
|
for comment in s.findAll(class_='comment_container'):
|
|
temp_ele = comment.find(class_='comment-parent')
|
|
parent_cid = None if temp_ele is None else int(temp_ele.attrs.get('href')[5:])
|
|
|
|
# Comment is deleted or hidden
|
|
if comment.find(class_='comment-link') is None:
|
|
continue
|
|
|
|
data['comments'].append({
|
|
'cid': int(comment.find(class_='comment-link').attrs.get('href')[5:]),
|
|
'parent_cid': parent_cid,
|
|
'content': comment.find(class_='comment_text').contents[0].strip(),
|
|
'username': comment.find(class_='comment_username').text,
|
|
'date': comment.find(class_='popup_date').attrs.get('title')
|
|
})
|
|
|
|
url = 'https:{}'.format(image)
|
|
output_path = os.path.join(args.output, filename)
|
|
|
|
if not args.dont_redownload or not os.path.isfile(output_path):
|
|
if not download_file(url, output_path, data["title"]):
|
|
return False
|
|
else:
|
|
print('Skipping "{}", since it\'s already downloaded'.format(data["title"]))
|
|
return True
|
|
|
|
# Write a UTF-8 encoded JSON file for metadata
|
|
with open(os.path.join(args.metadir, '{}.json'.format(filename)), 'w', encoding='utf-8') as f:
|
|
json.dump(data, f, ensure_ascii=False, indent=4)
|
|
|
|
return True
|
|
|
|
|
|
# Main downloading loop
|
|
while True:
|
|
page_url = '{}/{}'.format(gallery_url, page_num)
|
|
response = session.get(page_url)
|
|
s = BeautifulSoup(response.text, 'html.parser')
|
|
|
|
# Account status
|
|
if page_num == 1:
|
|
if s.find(class_='loggedin_user_avatar') is not None:
|
|
account_username = s.find(class_='loggedin_user_avatar').attrs.get('alt')
|
|
print('Logged in as', account_username)
|
|
else:
|
|
print('Not logged in, NSFW content is inaccessible')
|
|
|
|
# System messages
|
|
if s.find(class_='notice-message') is not None:
|
|
message = s.find(class_='notice-message').find('div').find(class_="link-override").text.strip()
|
|
raise Exception('System Message', message)
|
|
|
|
# End of gallery
|
|
if s.find(id='no-images') is not None:
|
|
print('End of gallery')
|
|
break
|
|
|
|
# Download all images on the page
|
|
for img in s.findAll('figure'):
|
|
download(img.find('a').attrs.get('href'))
|
|
sleep(args.interval)
|
|
|
|
# Favorites galleries use a weird timestamp system, so grab the next "page" from the Next button
|
|
if args.category == 'favorites':
|
|
next_button = s.find('a', class_='button standard right')
|
|
if next_button is None:
|
|
break
|
|
|
|
# URL looks something like /favorites/:username/:timestamp/next
|
|
# Splitting on the username is more robust to future URL changes
|
|
page_num = next_button.attrs['href'].split(args.username + '/')[-1]
|
|
else:
|
|
page_num += 1
|
|
|
|
print('Downloading page', page_num)
|
|
|
|
|
|
print('Finished downloading')
|