Merge pull request #4 from ovear/python

Some stability and functional fix
This commit is contained in:
Kentai Radiquum 2023-01-09 17:19:09 +05:00 committed by GitHub
commit b9d958c6c1
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
5 changed files with 121 additions and 21 deletions

View file

@ -144,6 +144,25 @@ parser.add_argument(
action="store_true",
help="create an index of downloaded files in an output folder",
)
# --real-category: boolean flag (store_true), exposed as args.real_category.
parser.add_argument(
"--real-category",
dest="real_category",
action="store_true",
help="this will download to the sub folder of its real category. it's useful when download favorites to avoid duplicate files",
)
# --request-compress: boolean flag (store_true), exposed as
# args.request_compress. The help text is assembled from two concatenated
# string literals.
parser.add_argument(
"--request-compress",
dest="request_compress",
action="store_true",
help="enable request compress which may save some bandwidth, but less file can be check by content-length. " +
"Since images won't be compress by default, it won't take much side effect to disable it by default",
)
# --check-file-size: boolean flag (store_true), exposed as
# args.check_file_size.
parser.add_argument(
"--check-file-size",
dest="check_file_size",
action="store_true",
help="check all files size when download, this will skip build-in archive",
)
args = parser.parse_args()
@ -180,6 +199,14 @@ metadata = args.metadata
dont_redownload = args.redownload
rating = args.rating
submission_filter = args.submission_filter
real_category = args.real_category
request_compress = args.request_compress
check_file_size = args.check_file_size
if check_file_size:
request_compress = False
index = False
# Colors
SUCCESS_COLOR = "\033[1;92m"

View file

@ -11,21 +11,26 @@ from Modules.functions import requests_retry_session
from Modules.functions import system_message_handler
def download(path):
response = requests_retry_session().get(f"{config.BASE_URL}{path}")
s = BeautifulSoup(response.text, "html.parser")
# System messages
if s.find(class_="notice-message") is not None:
system_message_handler(s)
def download(path, max_retries=5):
if max_retries < 0:
return False
try:
response = requests_retry_session().get(f"{config.BASE_URL}{path}")
s = BeautifulSoup(response.text, "html.parser")
# System messages
if s.find(class_="notice-message") is not None:
system_message_handler(s)
image = s.find(class_="download").find("a").attrs.get("href")
except AttributeError:
print(
f"{config.ERROR_COLOR}unsuccessful download of {config.BASE_URL}{path}{config.END}"
f"{config.ERROR_COLOR}unsuccessful download of {config.BASE_URL}{path} remains retries {max_retries}{config.END}"
)
download(path)
return True
return download(path, max_retries - 1)
except Exception as e:
print(f"{config.ERROR_COLOR}exception when download {config.BASE_URL}{path} remains retries {max_retries}, error {e}{config.END}")
return download(path, max_retries - 1)
filename = sanitize_filename(image.split("/")[-1:][0])
@ -44,10 +49,14 @@ def download(path):
output = f"{config.output_folder}/{author}"
rating = s.find(class_="rating-box").text.strip()
if config.category != "gallery":
output = f"{config.output_folder}/{author}/{config.category}"
if config.folder is not None:
output = f"{config.output_folder}/{author}/{config.folder}"
if config.real_category:
real_category = get_image_cateory(s)
output = f"{config.output_folder}/{author}/{real_category}"
else:
if config.category != "gallery":
output = f"{config.output_folder}/{author}/{config.category}"
if config.folder is not None:
output = f"{config.output_folder}/{author}/{config.folder}"
os.makedirs(output, exist_ok=True)
output_path = f"{output}/{title} ({view_id}) - {filename}"
@ -57,13 +66,21 @@ def download(path):
output_path = f"{output}/{rating}/{title} ({view_id}) - {filename}"
output_path_fb = f"{output}/{rating}/{title} - {filename}"
image_url = f"https:{image}"
if config.check_file_size and (
os.path.isfile(output_path_fb) or os.path.isfile(output_path)
):
content_length = get_content_length(image_url)
delete_file_if_mismatch_size(output_path_fb, content_length)
delete_file_if_mismatch_size(output_path, content_length)
if config.dont_redownload is True and (
os.path.isfile(output_path_fb) or os.path.isfile(output_path)
):
return file_exists_fallback(author, title, view_id)
image_url = f"https:{image}"
if (
download_file(
image_url, f"{config.BASE_URL}{path}", output_path, f"{title} - [{rating}]"
@ -74,6 +91,8 @@ def download(path):
f"{config.output_folder}/index.idx", encoding="utf-8", mode="a+"
) as idx:
idx.write(f"({view_id})\n")
else:
return download(path, max_retries - 1)
if config.metadata is True:
if config.html_description is True:
@ -120,6 +139,7 @@ def download_file(url, view_url, file_name, desc):
)
return False
total = int(r.headers.get("Content-Length", 0))
encoding = r.headers.get('Content-Encoding', '')
with open(file_name, "wb") as file, tqdm(
desc=desc.ljust(40),
total=total,
@ -135,8 +155,43 @@ def download_file(url, view_url, file_name, desc):
print(f"{config.SUCCESS_COLOR}Finished downloading{config.END}")
os.remove(file_name)
exit()
except Exception as e:
os.remove(file_name)
print(f"{config.ERROR_COLOR}Download {file_name} ({view_url}) failed, error {e}. Remove file...{config.END}")
return False
# if webserver doesn't compress file, we should check file size
if len(encoding) == 0 and delete_file_if_mismatch_size(file_name, total):
return False
return True
def get_content_length(url):
    """Return the ``Content-Length`` reported for ``url``, or 0 if unknown.

    Sends a GET with ``stream=True`` through ``requests_retry_session`` and
    reads the ``Content-Length`` response header. The response is used as a
    context manager so it is closed on every path.

    Args:
        url: address of the file whose size should be looked up.

    Returns:
        The header value as an ``int``. 0 when the status code is not 200,
        when the header is missing, or when the request or the integer
        conversion raises.
    """
    try:
        with requests_retry_session().get(url, stream=True) as r:
            if r.status_code != 200:
                print(
                    f"{config.ERROR_COLOR}Got a HTTP {r.status_code} while get content length "
                    f'"{url}" ...return 0{config.END}'
                )
                return 0
            return int(r.headers.get("Content-Length", 0))
    except Exception as e:
        # Best-effort lookup: any failure means "size unknown" (0), but the
        # cause is reported so the failure is not silent.
        print(
            f"{config.ERROR_COLOR}Can not get content length for {url}, error {e}...{config.END}"
        )
    return 0
def delete_file_if_mismatch_size(path, target_size):
    """Delete ``path`` when its size on disk differs from ``target_size``.

    Args:
        path: file to check.
        target_size: expected size in bytes. Non-int values are converted
            with ``int()``; a value that cannot be converted, or that is
            zero or negative, is treated as "size unknown".

    Returns:
        True when the file existed, its size did not match, and it was
        removed. False when the size is unknown, ``path`` is not a regular
        file, or the sizes match.
    """
    if not isinstance(target_size, int):
        try:
            target_size = int(target_size)
        except (TypeError, ValueError):
            # No usable expected size, so there is nothing to compare with.
            return False
    if target_size <= 0 or not os.path.isfile(path):
        return False
    file_size = os.path.getsize(path)
    if file_size == target_size:
        return False
    print(f"{config.ERROR_COLOR}File size {file_size}b mismatch {target_size}b: delete file {path}{config.END}")
    os.remove(path)
    return True
def create_metadata(output, data, s, title, filename):
if config.rating is True:
@ -183,8 +238,10 @@ def create_metadata(output, data, s, title, filename):
def file_exists_fallback(author, title, view_id):
with open(f"{config.output_folder}/index.idx", encoding="utf-8", mode="a+") as idx:
idx.write(f"({view_id})\n")
# do not write to index when check file size is enabled
if not config.check_file_size:
with open(f"{config.output_folder}/index.idx", encoding="utf-8", mode="a+") as idx:
idx.write(f"({view_id})\n")
if config.check is True:
print(
f'fallback: {config.SUCCESS_COLOR}Downloaded all recent files of \
@ -196,3 +253,10 @@ def file_exists_fallback(author, title, view_id):
it\'s already downloaded{config.END}'
)
return True
def get_image_cateory(s):
    """Map a parsed submission page to a category folder name.

    Searches ``s`` for an element with class ``button standard mobile-fix``
    whose string is ``Main Gallery`` and then ``Scraps``, in that order.

    Returns:
        ``'gallery'`` or ``'scraps'`` for the first label found, otherwise
        ``'unknown'``.
    """
    button_class = 'button standard mobile-fix'
    label_to_category = (
        ('Main Gallery', 'gallery'),
        ('Scraps', 'scraps'),
    )
    for label, category in label_to_category:
        if s.find(class_=button_class, string=label) is not None:
            return category
    return 'unknown'

View file

@ -18,6 +18,8 @@ def requests_retry_session(
):
"""Get a session, and retry in case of an error"""
session = session or requests.Session()
if not config.request_compress:
session.headers.update({'Accept-Encoding': 'identity'})
if config.cookies is not None: # add cookies if present
cookies = cookielib.MozillaCookieJar(config.cookies)
cookies.load()
@ -160,10 +162,10 @@ def fav_next_button(parse_next_button):
print(f"{config.WARN_COLOR}Unable to find next button{config.END}")
raise DownloadComplete
next_page_link = parse_next_button.attrs["href"]
next_fav_num = re.search(r"\d+", next_page_link)
next_fav_num = re.findall(r"\d+", next_page_link)
if next_fav_num is None:
if len(next_fav_num) <= 0:
print(f"{config.WARN_COLOR}Failed to parse next favorite link{config.END}")
raise DownloadComplete
return f"{next_fav_num[0]}/next"
return f"{next_fav_num[-1]}/next"

View file

@ -40,6 +40,8 @@ def start_indexing(path, layer=0):
@lru_cache(maxsize=None)
def check_file(path):
"""compare file view id with index list"""
if config.check_file_size:
return False
view_id = path.split("/")[-2:-1][0]
with contextlib.suppress(FileNotFoundError):
with open(f"{config.output_folder}/index.idx", encoding="utf-8") as idx:

View file

@ -58,8 +58,13 @@ options:
download description as original html format, this won't work if json-description is enabled
--json-description, -jd
download description as a JSON list
--html-description, -hd
download description as original html format, this won't work if json-description is enabled
--login extract furaffinity cookies directly from your browser
--index create an index of downloaded files in an output folder
--real-category download each file into the sub folder of its real category. it's useful when downloading favorites, to avoid duplicate files
--request-compress enable request compression, which may save some bandwidth, but fewer files can then be checked by content-length. Since images are not compressed by default, leaving it disabled has little downside
--check-file-size check the size of every file when downloading; this will skip the built-in archive
Examples:
python3 furaffinity-dl.py koul -> will download gallery of user koul