1. Fix multiple crashes when downloading files

2. Add option to detect a commission's real category, and introduce a new directory structure
3. Check file size when a download completes to prevent corrupted files
4. Add option to enable request compression
5. Add option to force a file size check to detect corrupted files
6. Fix multiple edge cases that caused exceptions and corrupted files
7. Fix stack overflow in some cases
8. Add a maximum retry limit for downloads
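Taken together, the new options compose with the existing command line. A hypothetical invocation in the style of the README examples (the flag names come from this commit; `koul` is the example user already documented below):

    python3 furaffinity-dl.py --real-category --check-file-size koul -> downloads koul's gallery into per-category sub-folders and verifies every file's size (which in turn disables compression and the index, see the config wiring below)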
Ovear 2023-01-09 03:23:53 +08:00
parent df508cabbe
commit 976ed6f12c
5 changed files with 117 additions and 20 deletions


@@ -144,6 +144,25 @@ parser.add_argument(
action="store_true",
help="create an index of downloaded files in an output folder",
)
parser.add_argument(
"--real-category",
dest="real_category",
action="store_true",
help="this will download to the sub folder of its real category. it's useful when download favorites to avoid duplicate files",
)
parser.add_argument(
"--request-compress",
dest="request_compress",
action="store_true",
help="enable request compress which may save some bandwidth, but less file can be check by content-length. " +
"Since images won't be compress by default, it won't take much side effect to disable it by default",
)
parser.add_argument(
"--check-file-size",
dest="check_file_size",
action="store_true",
help="check all files size when download, this will skip build-in archive",
)
args = parser.parse_args()
@@ -180,6 +199,14 @@ metadata = args.metadata
dont_redownload = args.redownload
rating = args.rating
submission_filter = args.submission_filter
real_category = args.real_category
request_compress = args.request_compress
check_file_size = args.check_file_size
if check_file_size:
request_compress = False
index = False
# Colors
SUCCESS_COLOR = "\033[1;92m"
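The wiring above gives --check-file-size precedence over the other toggles: compression is forced off so Content-Length reflects the true byte count, and the index is disabled so previously indexed files are still re-verified. A minimal sketch of that resolution, using an illustrative standalone name rather than the script's own globals:

    def resolve_settings(check_file_size, request_compress, index):
        # compressed responses advertise the compressed Content-Length,
        # which would never match the decompressed file on disk
        if check_file_size:
            request_compress = False
            # the index would skip files before they could be size-checked
            index = False
        return check_file_size, request_compress, index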


@@ -11,21 +11,26 @@ from Modules.functions import requests_retry_session
from Modules.functions import system_message_handler
def download(path):
def download(path, max_retries=5):
if max_retries < 0:
return False
try:
response = requests_retry_session().get(f"{config.BASE_URL}{path}")
s = BeautifulSoup(response.text, "html.parser")
# System messages
if s.find(class_="notice-message") is not None:
system_message_handler(s)
try:
image = s.find(class_="download").find("a").attrs.get("href")
except AttributeError:
print(
f"{config.ERROR_COLOR}unsuccessful download of {config.BASE_URL}{path}{config.END}"
f"{config.ERROR_COLOR}unsuccessful download of {config.BASE_URL}{path} remains retries {max_retries}{config.END}"
)
download(path)
return True
return download(path, max_retries - 1)
except Exception as e:
print(f"{config.ERROR_COLOR}exception when download {config.BASE_URL}{path} remains retries {max_retries}, error {e}{config.END}")
return download(path, max_retries - 1)
filename = sanitize_filename(image.split("/")[-1:][0])
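The retry scheme is recursive: every failure re-enters download() with max_retries - 1, and the max_retries < 0 guard bounds the recursion depth, addressing the stack overflow named in the commit message. An iterative loop with the same semantics, as a sketch (attempt_download is a hypothetical stand-in for one fetch/parse/save pass):

    def download_with_retries(path, max_retries=5):
        for _ in range(max_retries + 1):  # max_retries=5 allows six attempts, like the guard above
            try:
                if attempt_download(path):  # hypothetical single attempt
                    return True
            except Exception as e:
                print(f"attempt for {path} failed: {e}")
        return False  # retries exhausted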
@@ -44,6 +49,10 @@ def download(path):
output = f"{config.output_folder}/{author}"
rating = s.find(class_="rating-box").text.strip()
if config.real_category:
real_category = get_image_category(s)
output = f"{config.output_folder}/{author}/{real_category}"
else:
if config.category != "gallery":
output = f"{config.output_folder}/{author}/{config.category}"
if config.folder is not None:
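With --real-category enabled, the category folder comes from the submission page itself rather than from the requested listing, so a favorites run that mixes gallery and scraps items keeps them apart. An illustrative layout (author, titles, and ids are made up), following the path pattern built below:

    output_folder/
    └── author/
        ├── gallery/
        │   └── General/Some Title (12345) - some_file.png
        └── scraps/
            └── Mature/Other Title (67890) - other_file.png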
@@ -57,13 +66,21 @@ def download(path):
output_path = f"{output}/{rating}/{title} ({view_id}) - {filename}"
output_path_fb = f"{output}/{rating}/{title} - {filename}"
image_url = f"https:{image}"
if config.check_file_size and (
os.path.isfile(output_path_fb) or os.path.isfile(output_path)
):
content_length = get_content_length(image_url)
delete_file_if_mismatch_size(output_path_fb, content_length)
delete_file_if_mismatch_size(output_path, content_length)
if config.dont_redownload is True and (
os.path.isfile(output_path_fb) or os.path.isfile(output_path)
):
return file_exists_fallback(author, title, view_id)
image_url = f"https:{image}"
if (
download_file(
image_url, f"{config.BASE_URL}{path}", output_path, f"{title} - [{rating}]"
@@ -74,6 +91,8 @@ def download(path):
f"{config.output_folder}/index.idx", encoding="utf-8", mode="a+"
) as idx:
idx.write(f"({view_id})\n")
else:
return download(path, max_retries - 1)
if config.metadata is True:
if config.html_description is True:
@@ -120,6 +139,7 @@ def download_file(url, view_url, file_name, desc):
)
return False
total = int(r.headers.get("Content-Length", 0))
encoding = r.headers.get('Content-Encoding', '')
with open(file_name, "wb") as file, tqdm(
desc=desc.ljust(40),
total=total,
@@ -135,8 +155,42 @@ def download_file(url, view_url, file_name, desc):
print(f"{config.SUCCESS_COLOR}Finished downloading{config.END}")
os.remove(file_name)
exit()
except Exception as e:
os.remove(file_name)
print(f"{config.ERROR_COLOR}Download {file_name} ({view_url}) failed, error {e}. Remove file...{config.END}")
return False
# if the web server didn't compress the file, verify its size against Content-Length
if len(encoding) == 0 and delete_file_if_mismatch_size(file_name, total):
return False
return True
def get_content_length(url):
try:
with requests_retry_session().get(url, stream=True) as r:
if r.status_code != 200:
print(
f'{config.ERROR_COLOR}Got HTTP {r.status_code} while getting content length of \
"{url}", returning 0{config.END}'
)
return 0
content_length = r.headers.get("Content-Length", 0)
return int(content_length)
except Exception:
pass
return 0
def delete_file_if_mismatch_size(path, target_size):
if not isinstance(target_size, int):
target_size = int(target_size)
if target_size <= 0 or not os.path.isfile(path):
return False
file_size = os.path.getsize(path)
if file_size != target_size:
print(f"{config.ERROR_COLOR}File size {file_size}b mismatch {target_size}b: delete file {path}{config.END}")
os.remove(path)
return True
return False
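These two helpers carry the corruption check: get_content_length asks the server for the expected byte count, and delete_file_if_mismatch_size compares it with the file on disk, deleting on mismatch. A sketch of the pre-skip verification they enable (variables as built earlier in download()):

    expected = get_content_length(image_url)
    if delete_file_if_mismatch_size(output_path, expected):
        # the corrupted file is gone, so the os.path.isfile() guard no longer
        # short-circuits and the submission is downloaded again
        pass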
def create_metadata(output, data, s, title, filename):
if config.rating is True:
@@ -183,6 +237,8 @@ def create_metadata(output, data, s, title, filename):
def file_exists_fallback(author, title, view_id):
# do not write to index when check file size is enabled
if not config.check_file_size:
with open(f"{config.output_folder}/index.idx", encoding="utf-8", mode="a+") as idx:
idx.write(f"({view_id})\n")
if config.check is True:
@@ -196,3 +252,10 @@ def file_exists_fallback(author, title, view_id):
it\'s already downloaded{config.END}'
)
return True
def get_image_category(s):
if s.find(class_="button standard mobile-fix", string="Main Gallery") is not None:
return "gallery"
elif s.find(class_="button standard mobile-fix", string="Scraps") is not None:
return "scraps"
return "unknown"
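get_image_category (spelling corrected from get_image_cateory) infers the real category from the navigation buttons on the submission page: it returns 'gallery' when it finds a 'Main Gallery' button and 'scraps' for a 'Scraps' button. A self-contained sketch of the same BeautifulSoup lookup against minimal stand-in markup:

    from bs4 import BeautifulSoup

    # minimal markup standing in for a submission page's navigation button
    html = '<a class="button standard mobile-fix">Main Gallery</a>'
    s = BeautifulSoup(html, "html.parser")
    # class_ matches the tag's full class value, string matches its text
    if s.find(class_="button standard mobile-fix", string="Main Gallery") is not None:
        print("detected category: gallery")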


@@ -18,6 +18,8 @@ def requests_retry_session(
):
"""Get a session, and retry in case of an error"""
session = session or requests.Session()
if not config.request_compress:
session.headers.update({'Accept-Encoding': 'identity'})
if config.cookies is not None: # add cookies if present
cookies = cookielib.MozillaCookieJar(config.cookies)
cookies.load()
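Sending Accept-Encoding: identity asks the server not to compress the response body, which keeps the Content-Length header equal to the bytes written to disk and therefore keeps the size check meaningful. A sketch of the effect, with an illustrative URL:

    import requests

    r = requests.get("https://example.com/image.png",  # illustrative URL
                     headers={"Accept-Encoding": "identity"})
    expected = int(r.headers.get("Content-Length", 0))
    # with an uncompressed body the raw length matches the advertised one
    print(expected == 0 or len(r.content) == expected)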


@@ -40,6 +40,8 @@ def start_indexing(path, layer=0):
@lru_cache(maxsize=None)
def check_file(path):
"""compare file view id with index list"""
if config.check_file_size:
return False
view_id = path.split("/")[-2:-1][0]
with contextlib.suppress(FileNotFoundError):
with open(f"{config.output_folder}/index.idx", encoding="utf-8") as idx:


@@ -54,12 +54,15 @@ options:
--filter enable submission filter
--metadata, -m enable metadata saving
--download DOWNLOAD download a specific submission by providing its id
--json-description, -jd
download description as a JSON list
--html-description, -hd
download description as original html format, this won't work if json-description is enabled
--login extract furaffinity cookies directly from your browser
--index create an index of downloaded files in an output folder
--real-category download to a sub-folder named after the submission's real category; useful when downloading favorites to avoid duplicate files
--request-compress enable request compression, which may save some bandwidth, but fewer files can be checked against Content-Length. Since images are not compressed by the server anyway, leaving this disabled has little downside
--check-file-size check the size of every file when downloading; this skips the built-in archive
Examples:
python3 furaffinity-dl.py koul -> will download gallery of user koul