#!/usr/bin/python3
import argparse
from argparse import RawTextHelpFormatter
import json
from bs4 import BeautifulSoup
import requests
import urllib.request
import http.cookiejar as cookielib
import urllib.parse
import re
import os

'''
Please refer to LICENSE for licensing conditions.

current ideas / things to do:
 -r replenish, keep downloading until it finds an already downloaded file
 -n number of posts to download
 file renaming to title
 metadata injection (gets messy easily)
 sqlite database
 support for beta theme
 using `requests` instead of `urllib`
 turn this into a module
'''

# Argument parsing
parser = argparse.ArgumentParser(
    formatter_class=RawTextHelpFormatter,
    description='Downloads the entire gallery/scraps/favorites of a furaffinity user',
    epilog='''
Examples:
 python3 fadl.py gallery koul
 python3 fadl.py -o koulsArt gallery koul
 python3 fadl.py -o mylasFavs favorites mylafox\n
You can also log in to FurAffinity in a web browser and load cookies to download restricted content:
 python3 fadl.py -c cookies.txt gallery letodoesart\n
DISCLAIMER: It is your own responsibility to check whether batch downloading is allowed by FurAffinity terms of service and to abide by them.
''')
parser.add_argument('category', metavar='category', type=str, nargs='?', default='gallery',
                    help='the category to download: gallery/scraps/favorites')
parser.add_argument('username', metavar='username', type=str, nargs='?',
                    help='username of the furaffinity user')
parser.add_argument('-o', metavar='output', dest='output', type=str, default='.',
                    help='output directory')
parser.add_argument('-c', metavar='cookies', dest='cookies', type=str, default='',
                    help='path to a Netscape cookies file')
parser.add_argument('-s', metavar='start', dest='start', type=int, default=1,
                    help='page number to start from')
args = parser.parse_args()

if args.username is None:
    parser.print_help()
    exit()

# Create output directory if it doesn't exist
if args.output != '.':
    os.makedirs(args.output, exist_ok=True)

# Check validity of category
valid_categories = ['gallery', 'favorites', 'scraps']
if args.category not in valid_categories:
    raise Exception('Category is not valid', args.category)

# Check validity of username
if re.search(r'[^a-zA-Z0-9\-~._]', args.username):
    raise Exception('Username contains invalid characters', args.username)

# Initialise a session
session = requests.Session()
session.headers.update({'User-Agent': 'furaffinity-dl redevelopment'})

# Load cookies from a Netscape cookie file (if provided)
if args.cookies != '':
    cookies = cookielib.MozillaCookieJar(args.cookies)
    cookies.load()
    session.cookies = cookies

base_url = 'https://www.furaffinity.net'
# Build the listing URL from the requested category, not just the gallery
gallery_url = '{}/{}/{}'.format(base_url, args.category, args.username)
page_num = args.start


# The cursed function that handles downloading
def download_file(path):
    page_url = '{}{}'.format(base_url, path)
    response = session.get(page_url)
    s = BeautifulSoup(response.text, 'html.parser')

    image = s.find(class_='download').find('a').attrs.get('href')
    title = s.find(class_='submission-title').find('p').contents[0]
    filename = image.split('/')[-1]
    data = {
        'id': int(path.split('/')[-2]),
        'filename': filename,
        'author': s.find(class_='submission-id-sub-container').find('a').find('strong').text,
        'date': s.find(class_='popup_date').attrs.get('title'),
        'title': title,
        'description': s.find(class_='submission-description').text.strip().replace('\r\n', '\n'),
        'tags': [],
        'category': s.find(class_='info').find(class_='category-name').text,
        'type': s.find(class_='info').find(class_='type-name').text,
        'species': s.find(class_='info').findAll('div')[2].find('span').text,
        'gender': s.find(class_='info').findAll('div')[3].find('span').text,
        'views': int(s.find(class_='views').find(class_='font-large').text),
        'favorites': int(s.find(class_='favorites').find(class_='font-large').text),
        'rating': s.find(class_='rating-box').text.strip(),
        'comments': []
    }

    # Extract tags
    for tag in s.find(class_='tags-row').findAll(class_='tags'):
        data['tags'].append(tag.find('a').text)

    # Extract comments
    for comment in s.findAll(class_='comment_container'):
        temp_ele = comment.find(class_='comment-parent')
        parent_cid = None if temp_ele is None else int(temp_ele.attrs.get('href')[5:])

        # Skip comments that were deleted or hidden
        if comment.find(class_='comment-link') is None:
            continue

        data['comments'].append({
            'cid': int(comment.find(class_='comment-link').attrs.get('href')[5:]),
            'parent_cid': parent_cid,
            'content': comment.find(class_='comment_text').contents[0].strip(),
            'username': comment.find(class_='comment_username').text,
            'date': comment.find(class_='popup_date').attrs.get('title')
        })

    # Write a UTF-8 encoded JSON file for metadata
    with open(os.path.join(args.output, '{}.json'.format(filename)), 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=4)

    print('Downloading "{}"...'.format(title))

    # FA keeps the uploader's original filename in the URL; if it contains
    # non-ASCII characters, urllib refuses the IRI, so percent-encode the
    # path before retrieving. Maybe consider `requests` instead of urllib.
    def strip_non_ascii(s):
        return ''.join(i for i in s if ord(i) < 128)

    url = 'https:{}'.format(image)
    url = urllib.parse.urlsplit(url)
    url = list(url)
    url[2] = urllib.parse.quote(url[2])
    url = urllib.parse.urlunsplit(url)
    urllib.request.urlretrieve(url, os.path.join(args.output, strip_non_ascii(filename)))
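
# The TODO list above mentions using `requests` instead of `urllib`; a
# minimal sketch of what that retrieval step could look like through the
# existing session (hypothetical helper, not called anywhere yet):
def download_with_requests(url, dest_path):
    # Stream the body to disk in chunks so large submissions are not held
    # in memory, and fail loudly on HTTP errors instead of saving an error page
    with session.get(url, stream=True) as r:
        r.raise_for_status()
        with open(dest_path, 'wb') as f:
            for chunk in r.iter_content(chunk_size=8192):
                f.write(chunk)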

# Main downloading loop
while True:
    page_url = '{}/{}'.format(gallery_url, page_num)
    response = session.get(page_url)
    s = BeautifulSoup(response.text, 'html.parser')

    # Account status (checked on the first fetched page, honouring -s)
    if page_num == args.start:
        if s.find(class_='loggedin_user_avatar') is not None:
            account_username = s.find(class_='loggedin_user_avatar').attrs.get('alt')
            print('Logged in as', account_username)
        else:
            print("Not logged in, some users' galleries may be inaccessible and NSFW content is not downloadable")

    # System messages
    if s.find(class_='notice-message') is not None:
        message = s.find(class_='notice-message').find('div')
        for ele in message:
            if ele.name is not None:
                ele.decompose()
        raise Exception('System Message', message.text.strip())

    # End of gallery
    if s.find(id='no-images') is not None:
        print('End of gallery')
        break

    # Download all images on the page
    for img in s.findAll('figure'):
        download_file(img.find('a').attrs.get('href'))

    page_num += 1
    print('Downloading page', page_num)

print('Finished downloading')
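
# Sketch for the '-r replenish' idea from the TODO list: with a helper like
# this (hypothetical, not used above), the page loop could stop as soon as
# it reaches a submission that is already on disk:
def already_downloaded(filename):
    # Treat a submission as downloaded when both the image file and its
    # JSON metadata are present in the output directory
    return (os.path.isfile(os.path.join(args.output, filename))
            and os.path.isfile(os.path.join(args.output, '{}.json'.format(filename))))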