From bfc1b00a4e978f3ee1a7efd0d6b1c7ee769eb651 Mon Sep 17 00:00:00 2001 From: Ovear Date: Thu, 29 Sep 2022 16:42:29 +0800 Subject: [PATCH 1/4] Update README for html description option --- Modules/config.py | 2 +- README.md | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/Modules/config.py b/Modules/config.py index 4f32cb0..c727748 100644 --- a/Modules/config.py +++ b/Modules/config.py @@ -132,7 +132,7 @@ parser.add_argument( "-hd", dest="html_description", action="store_true", - help="save description in original html format", + help="download description as original html format, this won't work if json-description is enabled", ) parser.add_argument( "--login", diff --git a/README.md b/README.md index aa8ea89..9185520 100644 --- a/README.md +++ b/README.md @@ -54,6 +54,8 @@ options: --filter enable submission filter --metadata, -m enable metadata saving --download DOWNLOAD download a specific submission by providing its id + --html-description, -hd + download description as original html format, this won't work if json-description is enabled --json-description, -jd download description as a JSON list --login extract furaffinity cookies directly from your browser From df508cabbe2d1fe1e29491a3547a1b4d343595ac Mon Sep 17 00:00:00 2001 From: Ovear Date: Sat, 7 Jan 2023 03:03:38 +0800 Subject: [PATCH 2/4] fix fav next button page match --- Modules/functions.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Modules/functions.py b/Modules/functions.py index 0f1d82d..7e8817e 100644 --- a/Modules/functions.py +++ b/Modules/functions.py @@ -160,10 +160,10 @@ def fav_next_button(parse_next_button): print(f"{config.WARN_COLOR}Unable to find next button{config.END}") raise DownloadComplete next_page_link = parse_next_button.attrs["href"] - next_fav_num = re.search(r"\d+", next_page_link) + next_fav_num = re.findall(r"\d+", next_page_link) - if next_fav_num is None: + if len(next_fav_num) <= 0: print(f"{config.WARN_COLOR}Failed to parse next favorite link{config.END}") raise DownloadComplete - return f"{next_fav_num[0]}/next" + return f"{next_fav_num[-1]}/next" From 976ed6f12c13ffebfa97e4857f323cf02b635813 Mon Sep 17 00:00:00 2001 From: Ovear Date: Mon, 9 Jan 2023 03:23:53 +0800 Subject: [PATCH 3/4] 1.Fix multiple crashes when download file 2.Add option to detect comission's category, also introduce new dir structure 3.Check file size when download complete to prevent corrupted file 4.Add option to enable request cache 5.Add option to force check file size to detect corrupted file 6.Fix multiple edge case that will caused exception and corrupt file 7.Fix stack overflow in some cases 8.Add max tries limit when download --- Modules/config.py | 27 ++++++++++++ Modules/download.py | 99 ++++++++++++++++++++++++++++++++++++-------- Modules/functions.py | 2 + Modules/index.py | 2 + README.md | 7 +++- 5 files changed, 117 insertions(+), 20 deletions(-) diff --git a/Modules/config.py b/Modules/config.py index c727748..98b864f 100644 --- a/Modules/config.py +++ b/Modules/config.py @@ -144,6 +144,25 @@ parser.add_argument( action="store_true", help="create an index of downloaded files in an output folder", ) +parser.add_argument( + "--real-category", + dest="real_category", + action="store_true", + help="this will download to the sub folder of its real category. it's useful when download favorites to avoid duplicate files", +) +parser.add_argument( + "--request-compress", + dest="request_compress", + action="store_true", + help="enable request compress which may save some bandwidth, but less file can be check by content-length. " + + "Since images won't be compress by default, it won't take much side effect to disable it by default", +) +parser.add_argument( + "--check-file-size", + dest="check_file_size", + action="store_true", + help="check all files size when download, this will skip build-in archive", +) args = parser.parse_args() @@ -180,6 +199,14 @@ metadata = args.metadata dont_redownload = args.redownload rating = args.rating submission_filter = args.submission_filter +real_category = args.real_category +request_compress = args.request_compress +check_file_size = args.check_file_size + +if check_file_size: + request_compress = False + index = False + # Colors SUCCESS_COLOR = "\033[1;92m" diff --git a/Modules/download.py b/Modules/download.py index b2b26ec..c2138af 100644 --- a/Modules/download.py +++ b/Modules/download.py @@ -11,21 +11,26 @@ from Modules.functions import requests_retry_session from Modules.functions import system_message_handler -def download(path): - response = requests_retry_session().get(f"{config.BASE_URL}{path}") - s = BeautifulSoup(response.text, "html.parser") - - # System messages - if s.find(class_="notice-message") is not None: - system_message_handler(s) +def download(path, max_retries=5): + if max_retries < 0: + return False try: + response = requests_retry_session().get(f"{config.BASE_URL}{path}") + + s = BeautifulSoup(response.text, "html.parser") + + # System messages + if s.find(class_="notice-message") is not None: + system_message_handler(s) image = s.find(class_="download").find("a").attrs.get("href") except AttributeError: print( - f"{config.ERROR_COLOR}unsuccessful download of {config.BASE_URL}{path}{config.END}" + f"{config.ERROR_COLOR}unsuccessful download of {config.BASE_URL}{path} remains retries {max_retries}{config.END}" ) - download(path) - return True + return download(path, max_retries - 1) + except Exception as e: + print(f"{config.ERROR_COLOR}exception when download {config.BASE_URL}{path} remains retries {max_retries}, error {e}{config.END}") + return download(path, max_retries - 1) filename = sanitize_filename(image.split("/")[-1:][0]) @@ -44,10 +49,14 @@ def download(path): output = f"{config.output_folder}/{author}" rating = s.find(class_="rating-box").text.strip() - if config.category != "gallery": - output = f"{config.output_folder}/{author}/{config.category}" - if config.folder is not None: - output = f"{config.output_folder}/{author}/{config.folder}" + if config.real_category: + real_category = get_image_cateory(s) + output = f"{config.output_folder}/{author}/{real_category}" + else: + if config.category != "gallery": + output = f"{config.output_folder}/{author}/{config.category}" + if config.folder is not None: + output = f"{config.output_folder}/{author}/{config.folder}" os.makedirs(output, exist_ok=True) output_path = f"{output}/{title} ({view_id}) - {filename}" @@ -57,13 +66,21 @@ def download(path): output_path = f"{output}/{rating}/{title} ({view_id}) - {filename}" output_path_fb = f"{output}/{rating}/{title} - {filename}" + image_url = f"https:{image}" + + if config.check_file_size and ( + os.path.isfile(output_path_fb) or os.path.isfile(output_path) + ): + content_length = get_content_length(image_url) + delete_file_if_mismatch_size(output_path_fb, content_length) + delete_file_if_mismatch_size(output_path, content_length) + + if config.dont_redownload is True and ( os.path.isfile(output_path_fb) or os.path.isfile(output_path) ): return file_exists_fallback(author, title, view_id) - image_url = f"https:{image}" - if ( download_file( image_url, f"{config.BASE_URL}{path}", output_path, f"{title} - [{rating}]" @@ -74,6 +91,8 @@ def download(path): f"{config.output_folder}/index.idx", encoding="utf-8", mode="a+" ) as idx: idx.write(f"({view_id})\n") + else: + return download(path, max_retries - 1) if config.metadata is True: if config.html_description is True: @@ -120,6 +139,7 @@ def download_file(url, view_url, file_name, desc): ) return False total = int(r.headers.get("Content-Length", 0)) + encoding = r.headers.get('Content-Encoding', '') with open(file_name, "wb") as file, tqdm( desc=desc.ljust(40), total=total, @@ -135,8 +155,42 @@ def download_file(url, view_url, file_name, desc): print(f"{config.SUCCESS_COLOR}Finished downloading{config.END}") os.remove(file_name) exit() + except Exception as e: + os.remove(file_name) + print(f"{config.ERROR_COLOR}Download {file_name} ({view_url}) failed, error {e}. Remove file...{config.END}") + return False + + # if webserver doesn't compress file, we should check file size + if len(encoding) == 0 and delete_file_if_mismatch_size(file_name, total): + return False return True +def get_content_length(url): + try: + with requests_retry_session().get(url, stream=True) as r: + if r.status_code != 200: + print( + f'{config.ERROR_COLOR}Got a HTTP {r.status_code} while get content length \ + "{url}" ...return 0{config.END}' + ) + return 0 + content_length = r.headers.get("Content-Length", 0) + return int(content_length) + except Exception: + pass + return 0 + +def delete_file_if_mismatch_size(path, target_size): + if type(target_size) != int: + target_size = int(target_size) + if target_size <= 0 or not os.path.isfile(path): + return False + file_size = os.path.getsize(path) + if file_size != target_size: + print(f"{config.ERROR_COLOR}File size {file_size}b mismatch {target_size}b: delete file {path}{config.END}") + os.remove(path) + return True + return False def create_metadata(output, data, s, title, filename): if config.rating is True: @@ -183,8 +237,10 @@ def create_metadata(output, data, s, title, filename): def file_exists_fallback(author, title, view_id): - with open(f"{config.output_folder}/index.idx", encoding="utf-8", mode="a+") as idx: - idx.write(f"({view_id})\n") + # do not write to index when check file size is enabled + if not config.check_file_size: + with open(f"{config.output_folder}/index.idx", encoding="utf-8", mode="a+") as idx: + idx.write(f"({view_id})\n") if config.check is True: print( f'fallback: {config.SUCCESS_COLOR}Downloaded all recent files of \ @@ -196,3 +252,10 @@ def file_exists_fallback(author, title, view_id): it\'s already downloaded{config.END}' ) return True + +def get_image_cateory(s): + if s.find(class_ = 'button standard mobile-fix', string = 'Main Gallery') is not None: + return 'gallery' + elif s.find(class_='button standard mobile-fix', string = 'Scraps') is not None: + return 'scraps' + return 'unknown' \ No newline at end of file diff --git a/Modules/functions.py b/Modules/functions.py index 7e8817e..cea829e 100644 --- a/Modules/functions.py +++ b/Modules/functions.py @@ -18,6 +18,8 @@ def requests_retry_session( ): """Get a session, and retry in case of an error""" session = session or requests.Session() + if not config.request_compress: + session.headers.update({'Accept-Encoding': 'identity'}) if config.cookies is not None: # add cookies if present cookies = cookielib.MozillaCookieJar(config.cookies) cookies.load() diff --git a/Modules/index.py b/Modules/index.py index 016aa78..2156e4a 100644 --- a/Modules/index.py +++ b/Modules/index.py @@ -40,6 +40,8 @@ def start_indexing(path, layer=0): @lru_cache(maxsize=None) def check_file(path): """compare file view id with index list""" + if config.check_file_size: + return False view_id = path.split("/")[-2:-1][0] with contextlib.suppress(FileNotFoundError): with open(f"{config.output_folder}/index.idx", encoding="utf-8") as idx: diff --git a/README.md b/README.md index 9185520..f8130c5 100644 --- a/README.md +++ b/README.md @@ -54,12 +54,15 @@ options: --filter enable submission filter --metadata, -m enable metadata saving --download DOWNLOAD download a specific submission by providing its id - --html-description, -hd - download description as original html format, this won't work if json-description is enabled --json-description, -jd download description as a JSON list + --html-description, -hd + download description as original html format, this won't work if json-description is enabled --login extract furaffinity cookies directly from your browser --index create an index of downloaded files in an output folder + --real-category this will download to its real category sub folder. it's useful when download favorites to avoid duplicate files + --request-compress enable request compress which may save some bandwidth, but less file can be check by content-length. Since images won't be compress by default, it won't take much side effect to disable it by default + --check-file-size check all files size when download, this will skip build-in archive Examples: python3 furaffinity-dl.py koul -> will download gallery of user koul From a43882456d0a449d9809ffd6226a4076929904dd Mon Sep 17 00:00:00 2001 From: Ovear Date: Mon, 9 Jan 2023 03:44:16 +0800 Subject: [PATCH 4/4] Add log when get content length for url failed. --- Modules/download.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Modules/download.py b/Modules/download.py index c2138af..288ae08 100644 --- a/Modules/download.py +++ b/Modules/download.py @@ -176,7 +176,8 @@ def get_content_length(url): return 0 content_length = r.headers.get("Content-Length", 0) return int(content_length) - except Exception: + except Exception as e: + print(f'{config.ERROR_COLOR}Can not get content length for {url}...{config.END}') pass return 0