From 675f558d03c30792b3da977991487ff561607e64 Mon Sep 17 00:00:00 2001
From: Radiquum
Date: Sun, 10 Jul 2022 03:24:39 +0500
Subject: [PATCH] changelog:

ability to add comments in the username list with "#"
automatic removal of "_" in usernames
speed up filter checking
add basic indexing -> speeds up the existing-file check when fetching newer files
other small changes
---
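Notes:

A quick way to see the new username-list handling in action. This is a
stand-alone re-creation of the parsing done at the bottom of furaffinity-dl.py,
not the shipped code; the sample lines are made up, and username_replace_chars
matches the dict added to Modules/config.py:

    # sketch: how username-list lines are cleaned before downloading
    username_replace_chars = {" ": "", "_": ""}

    sample_lines = [
        "radiquum          # everything after '#' is treated as a comment",
        "some_user name    # '_' and spaces are stripped -> 'someusername'",
        "# a fully commented-out line reduces to '' and is skipped",
    ]

    for line in sample_lines:
        username = line.split("#")[0].translate(str.maketrans(username_replace_chars))
        if username != "":
            print(username)  # prints: radiquum, someusername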
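The faster skip of already-downloaded files relies on the new on-disk naming,
which embeds the view id; the old name without the id is kept as a fallback so
files saved by previous versions are still detected. A sketch with made-up
values ("Output", "someauthor", etc. are assumptions, not the real defaults):

    # sketch: the two file names download() now considers
    author, title, view_id, filename = "someauthor", "Some Title", 12345678, "art.png"
    output = f"Output/{author}"
    output_path = f"{output}/{title} ({view_id}) - {filename}"  # new primary name
    output_path_fb = f"{output}/{title} - {filename}"           # pre-patch name, checked as fallback
    print(output_path)     # Output/someauthor/Some Title (12345678) - art.png
    print(output_path_fb)  # Output/someauthor/Some Title - art.png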
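The --index flag walks the output folder once and writes every existing file
path into index.idx, so later runs can skip a submission straight from the
gallery listing, without fetching its /view/ page first. A minimal sketch of
that lookup under an assumed index path (re.escape() is added here for safety;
the shipped check_file() searches the raw id):

    import re

    def check_file(path, index_file="Output/index.idx"):
        view_id = path.split("/")[-2:-1][0]  # "/view/12345678/" -> "12345678"
        try:
            with open(index_file, encoding="utf-8") as idx:
                return re.search(re.escape(view_id), idx.read()) is not None
        except FileNotFoundError:
            return False  # no index yet -> nothing is skipped

    print(check_file("/view/12345678/"))  # True once the id appears in index.idx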
 Modules/config.py    |  38 ++++++++++-
 Modules/download.py  | 156 +++++++++++++++++++++++--------------------
 Modules/functions.py |  60 +++++++++--------
 Modules/index.py     |  37 ++++++++++
 furaffinity-dl.py    |  91 +++++++++++++++++++------
 5 files changed, 260 insertions(+), 122 deletions(-)
 create mode 100644 Modules/index.py

diff --git a/Modules/config.py b/Modules/config.py
index 0b4ce34..f12a5c1 100644
--- a/Modules/config.py
+++ b/Modules/config.py
@@ -72,7 +72,10 @@ parser.add_argument(
     Folder-Name-Here",
 )
 parser.add_argument(
-    "-s", "--start", default=1, help="page number to start from",
+    "-s",
+    "--start",
+    default=1,
+    help="page number to start from",
 )
 parser.add_argument(
     "-S",
@@ -115,7 +118,7 @@ parser.add_argument(
 )
 parser.add_argument(
     "--download",
-    help="download a specific submission /view/12345678/",
+    help="download a specific submission by providing its id",
 )
 parser.add_argument(
     "-jd",
@@ -129,6 +132,11 @@ parser.add_argument(
     action="store_true",
     help="extract furaffinity cookies directly from your browser",
 )
+parser.add_argument(
+    "--index",
+    action="store_true",
+    help="create an index of downloaded files in an output folder",
+)
 
 args = parser.parse_args()
 
@@ -136,7 +144,7 @@ args = parser.parse_args()
 username = args.username
 category = args.category
 
-if username != None:
+if username is not None:
     username = username.split(" ")
 
 # Custom input
@@ -153,6 +161,7 @@ folder = args.folder
 
 login = args.login
 check = args.check
+index = args.index
 submissions = args.submissions
 json_description = args.json_description
 metadata = args.metadata
@@ -168,3 +177,26 @@ END = "\033[0m"
 
 # Globals
 BASE_URL = "https://www.furaffinity.net"
+username_replace_chars = {
+    " ": "",
+    "_": "",
+}
+search = 'YCH[a-z $-/:-?{-~!"^_`\\[\\]]*OPEN\
+|OPEN[a-z $-/:-?{-~!"^_`\\[\\]]*YCH\
+|YCH[a-z $-/:-?{-~!"^_`\\[\\]]*CLOSE\
+|CLOSE[a-z $-/:-?{-~!"^_`\\[\\]]*YCH\
+|YCH[a-z $-/:-?{-~!"^_`\\[\\]]*ABLE\
+|AVAIL[a-z $-/:-?{-~!"^_`\\[\\]]*YCH\
+|YCH[a-z $-/:-?{-~!"^_`\\[\\]]*CLONE\
+|CLONE[a-z $-/:-?{-~!"^_`\\[\\]]*YCH\
+|YCH[a-z $-/:-?{-~!"^_`\\[\\]]*LIM\
+|LIM[a-z $-/:-?{-~!"^_`\\[\\]]*YCH\
+|COM[a-z $-/:-?{-~!"^_`\\[\\]]*OPEN\
+|OPEN[a-z $-/:-?{-~!"^_`\\[\\]]*COM\
+|COM[a-z $-/:-?{-~!"^_`\\[\\]]*CLOSE[^r]\
+|CLOSE[a-z $-/:-?{-~!"^_`\\[\\]]*COM\
+|FIX[a-z $-/:-?{-~!"^_`\\[\\]]*ICE\
+|TELEGRAM[a-z $-/:-?{-~!"^_`\\[\\]]*STICK\
+|TG[a-z $-/:-?{-~!"^_`\\[\\]]*STICK\
+|REM[insder]*\\b\
+|\\bREF|\\bSale|auction|multislot|stream|adopt'
diff --git a/Modules/download.py b/Modules/download.py
index 0b94607..43b9b4b 100644
--- a/Modules/download.py
+++ b/Modules/download.py
@@ -1,12 +1,16 @@
+import http.cookiejar as cookielib
 import json
-from tqdm import tqdm
-from pathvalidate import sanitize_filename
-import Modules.config as config
 import os
+
 import requests
 from bs4 import BeautifulSoup
-import http.cookiejar as cookielib
-from Modules.functions import system_message_handler, check_filter, download_complete
+from pathvalidate import sanitize_filename
+from tqdm import tqdm
+
+import Modules.config as config
+from Modules.functions import download_complete
+from Modules.functions import requests_retry_session
+from Modules.functions import system_message_handler
 
 session = requests.session()
 if config.cookies is not None:  # add cookies if present
@@ -14,8 +18,10 @@ if config.cookies is not None:  # add cookies if present
     cookies.load()
     session.cookies = cookies
 
+
 def download(path):
-    response = session.get(f"{config.BASE_URL}{path}")
+
+    response = requests_retry_session(session=session).get(f"{config.BASE_URL}{path}")
     s = BeautifulSoup(response.text, "html.parser")
 
     # System messages
@@ -23,78 +29,71 @@ def download(path):
         system_message_handler(s)
 
     image = s.find(class_="download").find("a").attrs.get("href")
-    title = s.find(class_="submission-title").find("p").contents[0]
-    title = sanitize_filename(title)
-    dsc = s.find(class_="submission-description").text.strip().replace("\r\n", "\n")
+    filename = sanitize_filename(image.split("/")[-1:][0])
 
-    if config.json_description is True:
-        dsc = []
-    filename = image.split("/")[-1:][0]
-    data = {
-        "id": int(path.split("/")[-2:-1][0]),
-        "filename": filename,
-        "author": s.find(class_="submission-id-sub-container")
-        .find("a")
-        .find("strong")
-        .text,
-        "date": s.find(class_="popup_date").attrs.get("title"),
-        "title": title,
-        "description": dsc,
-        "url": f"{config.BASE_URL}{path}",
-        "tags": [],
-        "category": s.find(class_="info").find(class_="category-name").text,
-        "type": s.find(class_="info").find(class_="type-name").text,
-        "species": s.find(class_="info").findAll("div")[2].find("span").text,
-        "gender": s.find(class_="info").findAll("div")[3].find("span").text,
-        "views": int(s.find(class_="views").find(class_="font-large").text),
-        "favorites": int(s.find(class_="favorites").find(class_="font-large").text),
-        "rating": s.find(class_="rating-box").text.strip(),
-        "comments": [],
-    }
-    if config.submission_filter is True and check_filter(title) is True:
-        print(
-            f'{config.WARN_COLOR}"{title}" was filtered and will not be \
-downloaded - {data.get("url")}{config.END}'
-        )
-        return True
+    author = s.find(class_="submission-id-sub-container").find("a").find("strong").text
+    title = sanitize_filename(s.find(class_="submission-title").find("p").contents[0])
+    view_id = int(path.split("/")[-2:-1][0])
+
+    output = f"{config.output_folder}/{author}"
+    rating = s.find(class_="rating-box").text.strip()
+
+    if config.category != "gallery":
+        output = f"{config.output_folder}/{author}/{config.category}"
+    if config.folder is not None:
+        output = f"{config.output_folder}/{author}/{config.folder}"
+    os.makedirs(output, exist_ok=True)
+
+    output_path = f"{output}/{title} ({view_id}) - {filename}"
+    output_path_fb = f"{output}/{title} - {filename}"
+    if config.rating is True:
+        os.makedirs(f"{output}/{rating}", exist_ok=True)
+        output_path = f"{output}/{rating}/{title} ({view_id}) - {filename}"
+        output_path_fb = f"{output}/{rating}/{title} - {filename}"
+
+    if config.dont_redownload is True and os.path.isfile(output_path_fb):
+        return file_exists_fallback(author, title)
 
     image_url = f"https:{image}"
-    output = f"{config.output_folder}/{data.get('author')}"
-    if config.category != "gallery":
-        output = f"{config.output_folder}/{data.get('author')}/{config.category}"
-    if config.folder is not None:
-        output = f"{config.output_folder}/{data.get('author')}/{config.folder}"
-    os.makedirs(output, exist_ok=True)
-    filename = sanitize_filename(filename)
-    output_path = f"{output}/{title} - {filename}"
-    if config.rating is True:
-        os.makedirs(f'{output}/{data.get("rating")}', exist_ok=True)
-        output_path = f'{output}/{data.get("rating")}/{title} - {filename}'
-
-    if config.dont_redownload is True and os.path.isfile(output_path):
-        if config.check is True:
-            print(
-                f"{config.SUCCESS_COLOR}Downloaded all recent files of \"{data.get('author')}\"{config.END}"
-            )
-            raise download_complete
-        print(
-            f'{config.WARN_COLOR}Skipping "{title}" since it\'s already downloaded{config.END}'
-        )
-        return True
-    else:
-        download_file(
-            image_url,
-            output_path,
-            f'{title} - \
-[{data.get("rating")}]',
-        )
+    download_file(
+        image_url,
+        output_path,
+        f"{title} - \
+[{rating}]",
+    )
 
     if config.metadata is True:
+        dsc = s.find(class_="submission-description").text.strip().replace("\r\n", "\n")
+        if config.json_description is True:
+            dsc = []
+        data = {
+            "id": view_id,
+            "filename": filename,
+            "author": author,
+            "date": s.find(class_="popup_date").attrs.get("title"),
+            "title": title,
+            "description": dsc,
+            "url": f"{config.BASE_URL}{path}",
+            "tags": [],
+            "category": s.find(class_="info").find(class_="category-name").text,
+            "type": s.find(class_="info").find(class_="type-name").text,
+            "species": s.find(class_="info").findAll("div")[2].find("span").text,
+            "gender": s.find(class_="info").findAll("div")[3].find("span").text,
+            "views": int(s.find(class_="views").find(class_="font-large").text),
+            "favorites": int(s.find(class_="favorites").find(class_="font-large").text),
+            "rating": rating,
+            "comments": [],
+        }
         create_metadata(output, data, s, title, filename)
 
     if config.download is not None:
-        print(f'{config.SUCCESS_COLOR}File saved as "{output_path}" {config.END}')
+        print(
+            f'{config.SUCCESS_COLOR}File saved as \
+"{output_path}" {config.END}'
+        )
+    return True
+
 
 def download_file(url, fname, desc):
     try:
         r = session.get(url, stream=True)
@@ -121,7 +120,8 @@ def download_file(url, fname, desc):
             os.remove(fname)
             exit()
     return True
-
+
+
 def create_metadata(output, data, s, title, filename):
     if config.rating is True:
         os.makedirs(f'{output}/{data.get("rating")}/metadata', exist_ok=True)
@@ -163,4 +163,18 @@ def create_metadata(output, data, s, title, filename):
 
     # Write a UTF-8 encoded JSON file for metadata
     with open(f"{metadata}.json", "w", encoding="utf-8") as f:
-        json.dump(data, f, ensure_ascii=False, indent=4)
\ No newline at end of file
+        json.dump(data, f, ensure_ascii=False, indent=4)
+
+
+def file_exists_fallback(author, title):
+    if config.check is True:
+        print(
+            f'fallback: {config.SUCCESS_COLOR}Downloaded all recent files of \
+"{author}"{config.END}'
+        )
+        raise download_complete
+    print(
+        f'fallback: {config.WARN_COLOR}Skipping "{title}" since \
+it\'s already downloaded{config.END}'
+    )
+    return True
diff --git a/Modules/functions.py b/Modules/functions.py
index 38da023..4320826 100644
--- a/Modules/functions.py
+++ b/Modules/functions.py
@@ -4,6 +4,8 @@ import re
 import browser_cookie3
 import requests
 from bs4 import BeautifulSoup
+from requests.adapters import HTTPAdapter
+from urllib3.util import Retry
 
 import Modules.config as config
 
@@ -13,39 +15,43 @@ if config.cookies is not None:  # add cookies if present
     cookies.load()
     session.cookies = cookies
 
+session.headers.update({"User-Agent": config.user_agent})
+
+
+def requests_retry_session(
+    retries=3,
+    backoff_factor=0.3,
+    status_forcelist=(500, 502, 504, 104),
+    session=None,
+):
+    session = session or requests.Session()
+    retry = Retry(
+        total=retries,
+        read=retries,
+        connect=retries,
+        backoff_factor=backoff_factor,
+        status_forcelist=status_forcelist,
+    )
+    adapter = HTTPAdapter(max_retries=retry)
+    session.mount("http://", adapter)
+    session.mount("https://", adapter)
+    return session
+
 
 class download_complete(Exception):
     pass
 
 
 def check_filter(title):
-    search = 'YCH[a-z $-/:-?{-~!"^_`\\[\\]]*OPEN\
-|OPEN[a-z $-/:-?{-~!"^_`\\[\\]]*YCH\
-|YCH[a-z $-/:-?{-~!"^_`\\[\\]]*CLOSE\
-|CLOSE[a-z $-/:-?{-~!"^_`\\[\\]]*YCH\
-|YCH[a-z $-/:-?{-~!"^_`\\[\\]]*ABLE\
-|AVAIL[a-z $-/:-?{-~!"^_`\\[\\]]*YCH\
-|YCH[a-z $-/:-?{-~!"^_`\\[\\]]*CLONE\
-|CLONE[a-z $-/:-?{-~!"^_`\\[\\]]*YCH\
-|YCH[a-z $-/:-?{-~!"^_`\\[\\]]*LIM\
-|LIM[a-z $-/:-?{-~!"^_`\\[\\]]*YCH\
-|COM[a-z $-/:-?{-~!"^_`\\[\\]]*OPEN\
-|OPEN[a-z $-/:-?{-~!"^_`\\[\\]]*COM\
-|COM[a-z $-/:-?{-~!"^_`\\[\\]]*CLOSE[^r]\
-|CLOSE[a-z $-/:-?{-~!"^_`\\[\\]]*COM\
-|FIX[a-z $-/:-?{-~!"^_`\\[\\]]*ICE\
-|TELEGRAM[a-z $-/:-?{-~!"^_`\\[\\]]*STICK\
-|TG[a-z $-/:-?{-~!"^_`\\[\\]]*STICK\
-|REM[insder]*\\b\
-|\\bREF|\\bSale|auction|multislot|stream|adopt'
     match = re.search(
-        search,
+        config.search,
         title,
         re.IGNORECASE,
     )
     if match is not None and title == match.string:
         return True
+    return None
 
 
@@ -68,9 +74,7 @@ def system_message_handler(s):
     raise download_complete
 
 
-def login(user_agent):
-
-    session.headers.update({"User-Agent": user_agent})
+def login():
 
     CJ = browser_cookie3.load()
 
@@ -103,8 +107,6 @@ by using "-c cookies.txt"{config.END}'
 furaffinity in your browser, or you can export cookies.txt manually{config.END}"
         )
 
-    exit()
-
 
 def next_button(page_url):
     response = session.get(page_url)
@@ -130,15 +132,17 @@ def next_button(page_url):
             raise download_complete
         page_num = next_button.parent.attrs["action"].split("/")[-2]
     else:
-        page_num = fav_next_button(s)
+        next_button = s.find("a", class_="button standard right", text="Next")
+        page_num = fav_next_button(next_button)
     print(
         f"Downloading page {page_num} - {config.BASE_URL}/{next_button.parent.attrs['action']}"
     )
     return page_num
 
 
-def fav_next_button(s):
+def fav_next_button(next_button):
     # unlike galleries that are sequentially numbered, favorites use a different scheme.
     # the "page_num" is instead: [set of numbers]/next (the trailing /next is required)
-    next_button = s.find("a", class_="button standard right", text="Next")
     if next_button is None:
         print(f"{config.WARN_COLOR}Unable to find next button{config.END}")
         raise download_complete
diff --git a/Modules/index.py b/Modules/index.py
new file mode 100644
index 0000000..57fc959
--- /dev/null
+++ b/Modules/index.py
@@ -0,0 +1,37 @@
+import contextlib
+import re
+from pathlib import Path
+
+import Modules.config as config
+
+
+def start_indexing(path, layer=0):
+    """Recursively iterate over each item in path
+    and write each file's path to the index.
+    """
+
+    # make Path object from input string
+    path = Path(path)
+    with open(f"{config.output_folder}/index.idx", encoding="utf-8", mode="a+") as idx:
+
+        # iter the directory
+        for p in path.iterdir():
+
+            if p.is_file():
+                idx.write(f"{p}\n")
+
+            elif p.is_dir():
+                start_indexing(p, layer + 1)
+
+            else:
+                raise FileNotFoundError()
+
+
+def check_file(path):
+    view_id = path.split("/")[-2:-1][0]
+    with contextlib.suppress(FileNotFoundError):
+        with open(f"{config.output_folder}/index.idx", encoding="utf-8") as idx:
+            index = idx.read()
+            match = re.search(view_id, index)
+            if match is not None:
+                return True
diff --git a/furaffinity-dl.py b/furaffinity-dl.py
index 5118f2c..234d235 100644
--- a/furaffinity-dl.py
+++ b/furaffinity-dl.py
@@ -9,10 +9,14 @@ from bs4 import BeautifulSoup
 
 import Modules.config as config
 from Modules.download import download
+from Modules.functions import check_filter
 from Modules.functions import download_complete
 from Modules.functions import login
 from Modules.functions import next_button
+from Modules.functions import requests_retry_session
 from Modules.functions import system_message_handler
+from Modules.index import check_file
+from Modules.index import start_indexing
 
 # get session
 session = requests.session()
@@ -31,12 +35,13 @@ def main():
     while True:
         if config.stop == page_num:
             print(
-                f'{config.WARN_COLOR}Reached page "{config.stop}", stopping.{config.END}'
+                f'{config.WARN_COLOR}Reached page "{config.stop}", \
+stopping.{config.END}'
             )
             break
 
         page_url = f"{download_url}/{page_num}"
-        response = session.get(page_url)
+        response = requests_retry_session(session=session).get(page_url)
         s = BeautifulSoup(response.text, "html.parser")
 
         # System messages
@@ -50,7 +55,30 @@ def main():
 
         # Download all images on the page
         for img in s.findAll("figure"):
-            download(img.find("a").attrs.get("href"))
+            title = img.find("figcaption").contents[0].text
+            img_url = img.find("a").attrs.get("href")
+
+            if config.submission_filter is True and check_filter(title) is True:
+                print(
+                    f'{config.WARN_COLOR}"{title}" was filtered and will not be \
+downloaded - {config.BASE_URL}{img_url}{config.END}'
+                )
+                continue
+
+            if config.dont_redownload is True and check_file(img_url) is True:
+                if config.check is True:
+                    print(
+                        f'{config.SUCCESS_COLOR}Downloaded all recent files of \
+"{config.username[0]}"{config.END}'
+                    )
+                    raise download_complete
+                print(
+                    f'{config.WARN_COLOR}Skipping "{title}" since \
+it\'s already downloaded{config.END}'
+                )
+                continue
+
+            download(img_url)
             sleep(config.interval)
 
         page_num = next_button(page_url)
@@ -58,13 +86,18 @@ def main():
 
 if __name__ == "__main__":
     if config.login is True:
-        login(config.user_agent)
+        login()
+        exit()
+
+    if config.index is True:
+        if os.path.isfile(f"{config.output_folder}/index.idx"):
+            os.remove(f"{config.output_folder}/index.idx")
+        start_indexing(config.output_folder)
+        print(f"{config.SUCCESS_COLOR}indexing finished{config.END}")
+        exit()
 
     try:
-        response = session.get(config.BASE_URL)
-    except ConnectionError:
-        print(f"{config.ERROR_COLOR}Connection failed{config.END}")
-        exit()
+        response = requests_retry_session(session=session).get(config.BASE_URL)
     except KeyboardInterrupt:
         print(f"{config.WARN_COLOR}Aborted by user{config.END}")
         exit()
@@ -72,14 +105,18 @@ if __name__ == "__main__":
     s = BeautifulSoup(response.text, "html.parser")
     if s.find(class_="loggedin_user_avatar") is not None:
         account_username = s.find(class_="loggedin_user_avatar").attrs.get("alt")
-        print(f'{config.SUCCESS_COLOR}Logged in as "{account_username}"{config.END}')
+        print(
+            f'{config.SUCCESS_COLOR}Logged in as \
+"{account_username}"{config.END}'
+        )
     else:
         print(
-            f"{config.WARN_COLOR}Not logged in, NSFW content is inaccessible{config.END}"
+            f"{config.WARN_COLOR}Not logged in, NSFW content \
+is inaccessible{config.END}"
         )
 
     if config.download is not None:
-        download(config.download)
+        download(f"/view/{config.download}/")
         exit()
 
     if config.submissions is True:
@@ -109,15 +146,29 @@ downloading "{config.folder[1]}"{config.END}'
         )
         exit()
 
-    if os.path.exists(config.username[0]):
-        data = open(config.username[0]).read()
-        config.username = filter(None, data.split("\n"))
+    try:
+        if os.path.exists(config.username[0]):
+            data = open(config.username[0]).read()
+            config.username = filter(None, data.split("\n"))
+    except (TypeError, AttributeError):
+        print(
+            f"{config.ERROR_COLOR}Please enter a username \
+or provide a file with usernames (1 username per line){config.END}"
+        )
+        exit()
 
     for username in config.username:
-        print(f'{config.SUCCESS_COLOR}Now downloading "{username}"{config.END}')
-        download_url = f"{config.BASE_URL}/{config.category}/{username}"
-        main()
-        print(
-            f'{config.SUCCESS_COLOR}Finished \
-downloading "{username}"{config.END}'
-        )
+        username = username.split("#")[0].translate(
+            str.maketrans(config.username_replace_chars)
+        )
+        if username != "":
+            print(f'{config.SUCCESS_COLOR}Now downloading "{username}"{config.END}')
+            download_url = f"{config.BASE_URL}/{config.category}/{username}"
+            main()
+            print(
+                f'{config.SUCCESS_COLOR}Finished \
+downloading "{username}"{config.END}'
+            )
+    if os.path.isfile(f"{config.output_folder}/index.idx"):
+        os.remove(f"{config.output_folder}/index.idx")
+    start_indexing(config.output_folder)