changelog:

ability to add comments to a username list with "#"
automatic removal of "_" from usernames
faster filter checking
basic indexing to speed up the already-downloaded check when fetching newer files
other small changes
Kentai Radiquum 2022-07-10 03:24:39 +05:00
parent 377df392e5
commit 675f558d03
5 changed files with 260 additions and 122 deletions
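
The first two changelog items are easiest to see in a small sketch. A minimal, hypothetical example of the new normalization applied to each line of a username list ("#" starts a comment; spaces and underscores are stripped via the new username_replace_chars map):

line = "some_user name  # trailing comment"
username_replace_chars = {" ": "", "_": ""}
username = line.split("#")[0].translate(str.maketrans(username_replace_chars))
print(username)  # -> "someusername"

Lines that are empty after stripping (pure comment lines) are skipped by the if username != "": check in the main loop.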

View file

@@ -72,7 +72,10 @@ parser.add_argument(
Folder-Name-Here",
)
parser.add_argument(
"-s", "--start", default=1, help="page number to start from",
"-s",
"--start",
default=1,
help="page number to start from",
)
parser.add_argument(
"-S",
@@ -115,7 +118,7 @@ parser.add_argument(
)
parser.add_argument(
"--download",
help="download a specific submission /view/12345678/",
help="download a specific submission by providing its id",
)
parser.add_argument(
"-jd",
@@ -129,6 +132,11 @@ parser.add_argument(
action="store_true",
help="extract furaffinity cookies directly from your browser",
)
parser.add_argument(
"--index",
action="store_true",
help="create an index of downloaded files in an output folder",
)
args = parser.parse_args()
@@ -136,7 +144,7 @@ args = parser.parse_args()
username = args.username
category = args.category
if username != None:
if username is not None:
username = username.split(" ")
# Custom input
@@ -153,6 +161,7 @@ folder = args.folder
login = args.login
check = args.check
index = args.index
submissions = args.submissions
json_description = args.json_description
metadata = args.metadata
@@ -168,3 +177,26 @@ END = "\033[0m"
# Globals
BASE_URL = "https://www.furaffinity.net"
username_replace_chars = {
" ": "",
"_": "",
}
search = 'YCH[a-z $-/:-?{-~!"^_`\\[\\]]*OPEN\
|OPEN[a-z $-/:-?{-~!"^_`\\[\\]]*YCH\
|YCH[a-z $-/:-?{-~!"^_`\\[\\]]*CLOSE\
|CLOSE[a-z $-/:-?{-~!"^_`\\[\\]]*YCH\
|YCH[a-z $-/:-?{-~!"^_`\\[\\]]*ABLE\
|AVAIL[a-z $-/:-?{-~!"^_`\\[\\]]*YCH\
|YCH[a-z $-/:-?{-~!"^_`\\[\\]]*CLONE\
|CLONE[a-z $-/:-?{-~!"^_`\\[\\]]*YCH\
|YCH[a-z $-/:-?{-~!"^_`\\[\\]]*LIM\
|LIM[a-z $-/:-?{-~!"^_`\\[\\]]*YCH\
|COM[a-z $-/:-?{-~!"^_`\\[\\]]*OPEN\
|OPEN[a-z $-/:-?{-~!"^_`\\[\\]]*COM\
|COM[a-z $-/:-?{-~!"^_`\\[\\]]*CLOSE[^r]\
|CLOSE[a-z $-/:-?{-~!"^_`\\[\\]]*COM\
|FIX[a-z $-/:-?{-~!"^_`\\[\\]]*ICE\
|TELEGRAM[a-z $-/:-?{-~!"^_`\\[\\]]*STICK\
|TG[a-z $-/:-?{-~!"^_`\\[\\]]*STICK\
|REM[insder]*\\b\
|\\bREF|\\bSale|auction|multislot|stream|adopt'
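
For reference, a minimal sketch of how this pattern is exercised (the sample titles are invented; the real call site, check_filter, appears later in this diff):

import re

for title in ("YCH OPEN - 2 slots", "Quiet forest walk"):
    match = re.search(search, title, re.IGNORECASE)
    print(title, "->", "kept" if match is None else "filtered")
# "YCH OPEN - 2 slots" is filtered; "Quiet forest walk" is kept.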

View file

@@ -1,12 +1,16 @@
import http.cookiejar as cookielib
import json
from tqdm import tqdm
from pathvalidate import sanitize_filename
import Modules.config as config
import os
import requests
from bs4 import BeautifulSoup
import http.cookiejar as cookielib
from Modules.functions import system_message_handler, check_filter, download_complete
from pathvalidate import sanitize_filename
from tqdm import tqdm
import Modules.config as config
from Modules.functions import download_complete
from Modules.functions import requests_retry_session
from Modules.functions import system_message_handler
session = requests.session()
if config.cookies is not None: # add cookies if present
@@ -14,8 +18,10 @@ if config.cookies is not None: # add cookies if present
cookies.load()
session.cookies = cookies
def download(path):
response = session.get(f"{config.BASE_URL}{path}")
response = requests_retry_session(session=session).get(f"{config.BASE_URL}{path}")
s = BeautifulSoup(response.text, "html.parser")
# System messages
@@ -23,78 +29,71 @@ def download(path):
system_message_handler(s)
image = s.find(class_="download").find("a").attrs.get("href")
title = s.find(class_="submission-title").find("p").contents[0]
title = sanitize_filename(title)
dsc = s.find(class_="submission-description").text.strip().replace("\r\n", "\n")
filename = sanitize_filename(image.split("/")[-1:][0])
if config.json_description is True:
dsc = []
filename = image.split("/")[-1:][0]
data = {
"id": int(path.split("/")[-2:-1][0]),
"filename": filename,
"author": s.find(class_="submission-id-sub-container")
.find("a")
.find("strong")
.text,
"date": s.find(class_="popup_date").attrs.get("title"),
"title": title,
"description": dsc,
"url": f"{config.BASE_URL}{path}",
"tags": [],
"category": s.find(class_="info").find(class_="category-name").text,
"type": s.find(class_="info").find(class_="type-name").text,
"species": s.find(class_="info").findAll("div")[2].find("span").text,
"gender": s.find(class_="info").findAll("div")[3].find("span").text,
"views": int(s.find(class_="views").find(class_="font-large").text),
"favorites": int(s.find(class_="favorites").find(class_="font-large").text),
"rating": s.find(class_="rating-box").text.strip(),
"comments": [],
}
if config.submission_filter is True and check_filter(title) is True:
print(
f'{config.WARN_COLOR}"{title}" was filtered and will not be \
downloaded - {data.get("url")}{config.END}'
)
return True
author = s.find(class_="submission-id-sub-container").find("a").find("strong").text
title = sanitize_filename(s.find(class_="submission-title").find("p").contents[0])
view_id = int(path.split("/")[-2:-1][0])
output = f"{config.output_folder}/{author}"
rating = s.find(class_="rating-box").text.strip()
if config.category != "gallery":
output = f"{config.output_folder}/{author}/{config.category}"
if config.folder is not None:
output = f"{config.output_folder}/{author}/{config.folder}"
os.makedirs(output, exist_ok=True)
output_path = f"{output}/{title} ({view_id}) - {filename}"
output_path_fb = f"{output}/{title} - {filename}"
if config.rating is True:
os.makedirs(f"{output}/{rating}", exist_ok=True)
output_path = f"{output}/{rating}/{title} ({view_id}) - {filename}"
output_path_fb = f"{output}/{rating}/{title} - {filename}"
if config.dont_redownload is True and os.path.isfile(output_path_fb):
return file_exists_fallback(author, title)
image_url = f"https:{image}"
output = f"{config.output_folder}/{data.get('author')}"
if config.category != "gallery":
output = f"{config.output_folder}/{data.get('author')}/{config.category}"
if config.folder is not None:
output = f"{config.output_folder}/{data.get('author')}/{config.folder}"
os.makedirs(output, exist_ok=True)
filename = sanitize_filename(filename)
output_path = f"{output}/{title} - {filename}"
if config.rating is True:
os.makedirs(f'{output}/{data.get("rating")}', exist_ok=True)
output_path = f'{output}/{data.get("rating")}/{title} - {filename}'
if config.dont_redownload is True and os.path.isfile(output_path):
if config.check is True:
print(
f"{config.SUCCESS_COLOR}Downloaded all recent files of \"{data.get('author')}\"{config.END}"
)
raise download_complete
print(
f'{config.WARN_COLOR}Skipping "{title}" since it\'s already downloaded{config.END}'
)
return True
else:
download_file(
image_url,
output_path,
f'{title} - \
[{data.get("rating")}]',
)
download_file(
image_url,
output_path,
f"{title} - \
[{rating}]",
)
if config.metadata is True:
dsc = s.find(class_="submission-description").text.strip().replace("\r\n", "\n")
if config.json_description is True:
dsc = []
data = {
"id": view_id,
"filename": filename,
"author": author,
"date": s.find(class_="popup_date").attrs.get("title"),
"title": title,
"description": dsc,
"url": f"{config.BASE_URL}{path}",
"tags": [],
"category": s.find(class_="info").find(class_="category-name").text,
"type": s.find(class_="info").find(class_="type-name").text,
"species": s.find(class_="info").findAll("div")[2].find("span").text,
"gender": s.find(class_="info").findAll("div")[3].find("span").text,
"views": int(s.find(class_="views").find(class_="font-large").text),
"favorites": int(s.find(class_="favorites").find(class_="font-large").text),
"rating": rating,
"comments": [],
}
create_metadata(output, data, s, title, filename)
if config.download is not None:
print(f'{config.SUCCESS_COLOR}File saved as "{output_path}" {config.END}')
print(
f'{config.SUCCESS_COLOR}File saved as \
"{output_path}" {config.END}'
)
return True
def download_file(url, fname, desc):
try:
r = session.get(url, stream=True)
@@ -121,7 +120,8 @@ def download_file(url, fname, desc):
os.remove(fname)
exit()
return True
def create_metadata(output, data, s, title, filename):
if config.rating is True:
os.makedirs(f'{output}/{data.get("rating")}/metadata', exist_ok=True)
@@ -163,4 +163,18 @@ def create_metadata(output, data, s, title, filename):
# Write a UTF-8 encoded JSON file for metadata
with open(f"{metadata}.json", "w", encoding="utf-8") as f:
json.dump(data, f, ensure_ascii=False, indent=4)
def file_exists_fallback(author, title):
if config.check is True:
print(
f'fallback: {config.SUCCESS_COLOR}Downloaded all recent files of \
"{author}"{config.END}'
)
raise download_complete
print(
f'fallback: {config.WARN_COLOR}Skipping "{title}" since \
it\'s already downloaded{config.END}'
)
return True
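
In short, filenames now embed the submission id, and the old id-less name is kept only as a fallback so files downloaded before this commit are still recognized. A sketch with hypothetical values:

output, title, view_id, filename = "FA.files/artist", "Artwork", 12345678, "img.png"
output_path = f"{output}/{title} ({view_id}) - {filename}"  # new naming scheme
output_path_fb = f"{output}/{title} - {filename}"           # old naming scheme
# file_exists_fallback() fires when the old-style path already exists,
# so nothing downloaded under the previous scheme is fetched twice.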

View file

@@ -4,6 +4,8 @@ import re
import browser_cookie3
import requests
from bs4 import BeautifulSoup
from requests.adapters import HTTPAdapter
from urllib3.util import Retry
import Modules.config as config
@@ -13,39 +15,43 @@ if config.cookies is not None: # add cookies if present
cookies.load()
session.cookies = cookies
session.headers.update({"User-Agent": config.user_agent})
def requests_retry_session(
retries=3,
backoff_factor=0.3,
status_forcelist=(500, 502, 504, 104),
session=None,
):
session = session or requests.Session()
retry = Retry(
total=retries,
read=retries,
connect=retries,
backoff_factor=backoff_factor,
status_forcelist=status_forcelist,
)
adapter = HTTPAdapter(max_retries=retry)
session.mount("http://", adapter)
session.mount("https://", adapter)
return session
class download_complete(Exception):
pass
def check_filter(title):
search = 'YCH[a-z $-/:-?{-~!"^_`\\[\\]]*OPEN\
|OPEN[a-z $-/:-?{-~!"^_`\\[\\]]*YCH\
|YCH[a-z $-/:-?{-~!"^_`\\[\\]]*CLOSE\
|CLOSE[a-z $-/:-?{-~!"^_`\\[\\]]*YCH\
|YCH[a-z $-/:-?{-~!"^_`\\[\\]]*ABLE\
|AVAIL[a-z $-/:-?{-~!"^_`\\[\\]]*YCH\
|YCH[a-z $-/:-?{-~!"^_`\\[\\]]*CLONE\
|CLONE[a-z $-/:-?{-~!"^_`\\[\\]]*YCH\
|YCH[a-z $-/:-?{-~!"^_`\\[\\]]*LIM\
|LIM[a-z $-/:-?{-~!"^_`\\[\\]]*YCH\
|COM[a-z $-/:-?{-~!"^_`\\[\\]]*OPEN\
|OPEN[a-z $-/:-?{-~!"^_`\\[\\]]*COM\
|COM[a-z $-/:-?{-~!"^_`\\[\\]]*CLOSE[^r]\
|CLOSE[a-z $-/:-?{-~!"^_`\\[\\]]*COM\
|FIX[a-z $-/:-?{-~!"^_`\\[\\]]*ICE\
|TELEGRAM[a-z $-/:-?{-~!"^_`\\[\\]]*STICK\
|TG[a-z $-/:-?{-~!"^_`\\[\\]]*STICK\
|REM[insder]*\\b\
|\\bREF|\\bSale|auction|multislot|stream|adopt'
match = re.search(
search,
config.search,
title,
re.IGNORECASE,
)
if match is not None and title == match.string:
return True
return None
@@ -68,9 +74,7 @@ def system_message_handler(s):
raise download_complete
def login(user_agent):
session.headers.update({"User-Agent": user_agent})
def login():
CJ = browser_cookie3.load()
@@ -103,8 +107,6 @@ by using "-c cookies.txt"{config.END}'
furaffinity in your browser, or you can export cookies.txt manually{config.END}"
)
exit()
def next_button(page_url):
response = session.get(page_url)
@@ -130,15 +132,17 @@ def next_button(page_url):
raise download_complete
page_num = next_button.parent.attrs["action"].split("/")[-2]
else:
next_button = s.find("a", class_="button standard right", text="Next")
page_num = fav_next_button(s)
print(f"Downloading page {page_num} - {page_url}")
print(
f"Downloading page {page_num} - {config.BASE_URL}/{next_button.parent.attrs['action']}"
)
return page_num
def fav_next_button():
def fav_next_button(s):
# unlike galleries that are sequentially numbered, favorites use a different scheme.
# the "page_num" is instead: [set of numbers]/next (the trailing /next is required)
next_button = s.find("a", class_="button standard right", text="Next")
if next_button is None:
print(f"{config.WARN_COLOR}Unable to find next button{config.END}")
raise download_complete
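
A minimal usage sketch of the new retry helper (the URL is a placeholder): wrapping a session with requests_retry_session mounts an HTTPAdapter whose urllib3 Retry re-issues the request up to three times with exponential backoff when the server answers 500, 502, or 504.

import requests

session = requests.session()
response = requests_retry_session(session=session).get("https://www.furaffinity.net")
print(response.status_code)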

Modules/index.py (new file, 37 lines)
View file

@@ -0,0 +1,37 @@
import contextlib
import re
from pathlib import Path
import Modules.config as config
def start_indexing(path, layer=0):
"""Recursively iterate over each item in path
and print item's name.
"""
# make Path object from input string
path = Path(path)
with open(f"{config.output_folder}/index.idx", encoding="utf-8", mode="a+") as idx:
# iter the directory
for p in path.iterdir():
if p.is_file():
idx.write(f"{p}\n")
elif p.is_dir():
start_indexing(p, layer + 1)
else:
raise FileNotFoundError()
def check_file(path):
view_id = path.split("/")[-2:-1][0]
with contextlib.suppress(FileNotFoundError):
with open(f"{config.output_folder}/index.idx", encoding="utf-8") as idx:
index = idx.read()
match = re.search(view_id, index)
if match is not None:
return True
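
Roughly how the index is meant to be used (the view id below is hypothetical, and start_indexing always appends to config.output_folder/index.idx regardless of the path argument): the output folder is walked once up front, and check_file then searches that single file for a submission's view id instead of probing the filesystem per submission.

start_indexing(config.output_folder)  # writes <output_folder>/index.idx
if check_file("/view/12345678/"):     # True if "12345678" appears in the index
    print("already downloaded, skipping")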

View file

@@ -9,10 +9,14 @@ from bs4 import BeautifulSoup
import Modules.config as config
from Modules.download import download
from Modules.functions import check_filter
from Modules.functions import download_complete
from Modules.functions import login
from Modules.functions import next_button
from Modules.functions import requests_retry_session
from Modules.functions import system_message_handler
from Modules.index import check_file
from Modules.index import start_indexing
# get session
session = requests.session()
@@ -31,12 +35,13 @@ def main():
while True:
if config.stop == page_num:
print(
f'{config.WARN_COLOR}Reached page "{config.stop}", stopping.{config.END}'
f'{config.WARN_COLOR}Reached page "{config.stop}", \
stopping.{config.END}'
)
break
page_url = f"{download_url}/{page_num}"
response = session.get(page_url)
response = requests_retry_session(session=session).get(page_url)
s = BeautifulSoup(response.text, "html.parser")
# System messages
@@ -50,7 +55,30 @@ def main():
# Download all images on the page
for img in s.findAll("figure"):
download(img.find("a").attrs.get("href"))
title = img.find("figcaption").contents[0].text
img_url = img.find("a").attrs.get("href")
if config.submission_filter is True and check_filter(title) is True:
print(
f'{config.WARN_COLOR}"{title}" was filtered and will not be \
downloaded - {config.BASE_URL}{img_url}{config.END}'
)
continue
if config.dont_redownload is True and check_file(img_url) is True:
if config.check is True:
print(
f'{config.SUCCESS_COLOR}Downloaded all recent files of \
"{config.username[0]}"{config.END}'
)
raise download_complete
print(
f'{config.WARN_COLOR}Skipping "{title}" since \
it\'s already downloaded{config.END}'
)
continue
download(img_url)
sleep(config.interval)
page_num = next_button(page_url)
@@ -58,13 +86,18 @@ if __name__ == "__main__":
if __name__ == "__main__":
if config.login is True:
login(config.user_agent)
login()
exit()
if config.index is True:
if os.path.isfile(f"{config.output_folder}/index.idx"):
os.remove(f"{config.output_folder}/index.idx")
start_indexing(config.output_folder)
print(f"{config.SUCCESS_COLOR}indexing finished{config.END}")
exit()
try:
response = session.get(config.BASE_URL)
except ConnectionError:
print(f"{config.ERROR_COLOR}Connection failed{config.END}")
exit()
response = requests_retry_session(session=session).get(config.BASE_URL)
except KeyboardInterrupt:
print(f"{config.WARN_COLOR}Aborted by user{config.END}")
exit()
@@ -72,14 +105,18 @@ if __name__ == "__main__":
s = BeautifulSoup(response.text, "html.parser")
if s.find(class_="loggedin_user_avatar") is not None:
account_username = s.find(class_="loggedin_user_avatar").attrs.get("alt")
print(f'{config.SUCCESS_COLOR}Logged in as "{account_username}"{config.END}')
print(
f'{config.SUCCESS_COLOR}Logged in as \
"{account_username}"{config.END}'
)
else:
print(
f"{config.WARN_COLOR}Not logged in, NSFW content is inaccessible{config.END}"
f"{config.WARN_COLOR}Not logged in, NSFW content \
is inaccessible{config.END}"
)
if config.download is not None:
download(config.download)
download(f"/view/{config.download}/")
exit()
if config.submissions is True:
@@ -109,15 +146,29 @@ downloading "{config.folder[1]}"{config.END}'
)
exit()
if os.path.exists(config.username[0]):
data = open(config.username[0]).read()
config.username = filter(None, data.split("\n"))
try:
if os.path.exists(config.username[0]):
data = open(config.username[0]).read()
config.username = filter(None, data.split("\n"))
except (TypeError, AttributeError):
print(
f"{config.ERROR_COLOR}Please enter a username \
or provide a file with usernames (1 username per line){config.END}"
)
exit()
for username in config.username:
print(f'{config.SUCCESS_COLOR}Now downloading "{username}"{config.END}')
download_url = f"{config.BASE_URL}/{config.category}/{username}"
main()
print(
f'{config.SUCCESS_COLOR}Finished \
downloading "{username}"{config.END}'
username = username.split("#")[0].translate(
str.maketrans(config.username_replace_chars)
)
if username != "":
print(f'{config.SUCCESS_COLOR}Now downloading "{username}"{config.END}')
download_url = f"{config.BASE_URL}/{config.category}/{username}"
main()
print(
f'{config.SUCCESS_COLOR}Finished \
downloading "{username}"{config.END}'
)
if os.path.isfile(f"{config.output_folder}/index.idx"):
os.remove(f"{config.output_folder}/index.idx")
start_indexing(config.output_folder)