A 404 no longer breaks the script

Slight refactor as well
This commit is contained in:
Xerbo 2020-05-21 00:30:13 +01:00
parent e2ff807c40
commit c87a1f5355

View file

@ -16,10 +16,9 @@ Please refer to LICENSE for licensing conditions.
current ideas / things to do: current ideas / things to do:
-r replenish, keep downloading until it finds an already downloaded file -r replenish, keep downloading until it finds an already downloaded file
-n number of posts to download -n number of posts to download
file renaming to title
metadata injection (gets messy easily) metadata injection (gets messy easily)
sqlite database sqlite database
support for beta theme support for classic theme
using `requests` instead of `urllib` using `requests` instead of `urllib`
turn this into a module turn this into a module
''' '''
@ -44,7 +43,7 @@ parser.add_argument('-u', metavar='useragent', dest='ua', type=str, default='Moz
parser.add_argument('-s', metavar='start', dest='start', type=int, default=1, help="page number to start from") parser.add_argument('-s', metavar='start', dest='start', type=int, default=1, help="page number to start from")
args = parser.parse_args() args = parser.parse_args()
if args.username == None: if args.username is None:
parser.print_help() parser.print_help()
exit() exit()
@ -54,13 +53,13 @@ if args.output != '.':
# Check validity of category # Check validity of category
valid_categories = ['gallery', 'favorites', 'scraps'] valid_categories = ['gallery', 'favorites', 'scraps']
if not args.category in valid_categories: if args.category not in valid_categories:
raise Exception('Category is not valid', args.category) raise Exception('Category is not valid', args.category)
# Check validity of username # Check validity of username
if bool(re.compile(r'[^a-zA-Z0-9\-~._]').search(args.username)): if bool(re.compile(r'[^a-zA-Z0-9\-~._]').search(args.username)):
raise Exception('Username contains non-valid characters', args.username) raise Exception('Username contains non-valid characters', args.username)
# Initialise a session # Initialise a session
session = requests.Session() session = requests.Session()
session.headers.update({'User-Agent': args.ua}) session.headers.update({'User-Agent': args.ua})
@ -72,9 +71,10 @@ if args.cookies != '':
session.cookies = cookies session.cookies = cookies
base_url = 'https://www.furaffinity.net' base_url = 'https://www.furaffinity.net'
gallery_url = '{}/gallery/{}'.format(base_url, args.username) gallery_url = '{}/{}/{}'.format(base_url, args.category, args.username)
page_num = args.start page_num = args.start
# The cursed function that handles downloading # The cursed function that handles downloading
def download_file(path): def download_file(path):
page_url = '{}{}'.format(base_url, path) page_url = '{}{}'.format(base_url, path)
@ -82,7 +82,7 @@ def download_file(path):
s = BeautifulSoup(response.text, 'html.parser') s = BeautifulSoup(response.text, 'html.parser')
image = s.find(class_='download').find('a').attrs.get('href') image = s.find(class_='download').find('a').attrs.get('href')
title = s.find(class_='submission-title').find('p').contents[0]; title = s.find(class_='submission-title').find('p').contents[0]
filename = image.split("/")[-1:][0] filename = image.split("/")[-1:][0]
data = { data = {
'id': int(path.split('/')[-2:-1][0]), 'id': int(path.split('/')[-2:-1][0]),
@ -109,10 +109,10 @@ def download_file(path):
# Extract comments # Extract comments
for comment in s.findAll(class_='comment_container'): for comment in s.findAll(class_='comment_container'):
temp_ele = comment.find(class_='comment-parent') temp_ele = comment.find(class_='comment-parent')
parent_cid = None if temp_ele == None else int(temp_ele.attrs.get('href')[5:]) parent_cid = None if temp_ele is None else int(temp_ele.attrs.get('href')[5:])
# Comment deleted or hidden # Comment deleted or hidden
if comment.find(class_='comment-link') == None: if comment.find(class_='comment-link') is None:
continue continue
data['comments'].append({ data['comments'].append({
@ -123,10 +123,6 @@ def download_file(path):
'date': comment.find(class_='popup_date').attrs.get('title') 'date': comment.find(class_='popup_date').attrs.get('title')
}) })
# Write a UTF-8 encoded JSON file for metadata
with open(os.path.join(args.output, '{}.json'.format(filename)), 'w', encoding='utf-8') as f:
json.dump(data, f, ensure_ascii=False, indent=4)
print('Downloading "{}"... '.format(title)) print('Downloading "{}"... '.format(title))
# Because for some god forsaken reason FA keeps the original filename in the upload, in the case that it contains non-ASCII # Because for some god forsaken reason FA keeps the original filename in the upload, in the case that it contains non-ASCII
@ -138,7 +134,15 @@ def download_file(path):
url = list(url) url = list(url)
url[2] = urllib.parse.quote(url[2]) url[2] = urllib.parse.quote(url[2])
url = urllib.parse.urlunsplit(url) url = urllib.parse.urlunsplit(url)
urllib.request.urlretrieve(url, os.path.join(args.output, strip_non_ascii(filename))) try:
urllib.request.urlretrieve(url, os.path.join(args.output, strip_non_ascii(filename)))
except urllib.error.HTTPError:
print("404 Not Found, skipping")
# Write a UTF-8 encoded JSON file for metadata
with open(os.path.join(args.output, '{}.json'.format(filename)), 'w', encoding='utf-8') as f:
json.dump(data, f, ensure_ascii=False, indent=4)
# Main downloading loop # Main downloading loop
while True: while True:
@ -148,23 +152,23 @@ while True:
# Account status # Account status
if page_num == 1: if page_num == 1:
if s.find(class_='loggedin_user_avatar') != None: if s.find(class_='loggedin_user_avatar') is not None:
account_username = s.find(class_='loggedin_user_avatar').attrs.get('alt') account_username = s.find(class_='loggedin_user_avatar').attrs.get('alt')
print('Logged in as', account_username) print('Logged in as', account_username)
else: else:
print('Not logged in, some users\' galleries may be inaccessible and NSFW content is not downloadable') print('Not logged in, some users\' galleries may be inaccessible and NSFW content is not downloadable')
# System messages # System messages
if s.find(class_='notice-message') != None: if s.find(class_='notice-message') is not None:
message = s.find(class_='notice-message').find('div') message = s.find(class_='notice-message').find('div')
for ele in message: for ele in message:
if ele.name != None: if ele.name is not None:
ele.decompose() ele.decompose()
raise Exception('System Message', message.text.strip()) raise Exception('System Message', message.text.strip())
# End of gallery # End of gallery
if s.find(id='no-images') != None: if s.find(id='no-images') is not None:
print('End of gallery') print('End of gallery')
break break