changelog:

ability to add comments in the username list with "#" (see the sample list below)
automatic removal of "_" in usernames
sped up filter checking
added basic indexing -> speeds up the check for already-downloaded files when fetching newer ones
other small changes
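
For example, a usernames file under the new rules might look like this (names invented; everything after "#" is ignored, and spaces/underscores are stripped before downloading starts):

    # main accounts
    some_artist        # becomes "someartist"
    another-artist
    # paused for now: third_artist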
Commit 675f558d03 by Kentai Radiquum, 2022-07-10 03:24:39 +05:00
Parent: 377df392e5
GPG key ID: CB1FC16C710DB347 (no known key found for this signature in database)
5 changed files with 260 additions and 122 deletions

Modules/config.py

@@ -72,7 +72,10 @@ parser.add_argument(
     Folder-Name-Here",
 )
 parser.add_argument(
-    "-s", "--start", default=1, help="page number to start from",
+    "-s",
+    "--start",
+    default=1,
+    help="page number to start from",
 )
 parser.add_argument(
     "-S",
@@ -115,7 +118,7 @@ parser.add_argument(
 )
 parser.add_argument(
     "--download",
-    help="download a specific submission /view/12345678/",
+    help="download a specific submission by providing its id",
 )
 parser.add_argument(
     "-jd",
@@ -129,6 +132,11 @@ parser.add_argument(
     action="store_true",
     help="extract furaffinity cookies directly from your browser",
 )
+parser.add_argument(
+    "--index",
+    action="store_true",
+    help="create an index of downloaded files in an output folder",
+)
 
 args = parser.parse_args()
@@ -136,7 +144,7 @@ args = parser.parse_args()
 username = args.username
 category = args.category
 
-if username != None:
+if username is not None:
     username = username.split(" ")
 
 # Custom input
@@ -153,6 +161,7 @@ folder = args.folder
 login = args.login
 check = args.check
+index = args.index
 submissions = args.submissions
 json_description = args.json_description
 metadata = args.metadata
@@ -168,3 +177,26 @@ END = "\033[0m"
 
 # Globals
 BASE_URL = "https://www.furaffinity.net"
+username_replace_chars = {
+    " ": "",
+    "_": "",
+}
+search = 'YCH[a-z $-/:-?{-~!"^_`\\[\\]]*OPEN\
+|OPEN[a-z $-/:-?{-~!"^_`\\[\\]]*YCH\
+|YCH[a-z $-/:-?{-~!"^_`\\[\\]]*CLOSE\
+|CLOSE[a-z $-/:-?{-~!"^_`\\[\\]]*YCH\
+|YCH[a-z $-/:-?{-~!"^_`\\[\\]]*ABLE\
+|AVAIL[a-z $-/:-?{-~!"^_`\\[\\]]*YCH\
+|YCH[a-z $-/:-?{-~!"^_`\\[\\]]*CLONE\
+|CLONE[a-z $-/:-?{-~!"^_`\\[\\]]*YCH\
+|YCH[a-z $-/:-?{-~!"^_`\\[\\]]*LIM\
+|LIM[a-z $-/:-?{-~!"^_`\\[\\]]*YCH\
+|COM[a-z $-/:-?{-~!"^_`\\[\\]]*OPEN\
+|OPEN[a-z $-/:-?{-~!"^_`\\[\\]]*COM\
+|COM[a-z $-/:-?{-~!"^_`\\[\\]]*CLOSE[^r]\
+|CLOSE[a-z $-/:-?{-~!"^_`\\[\\]]*COM\
+|FIX[a-z $-/:-?{-~!"^_`\\[\\]]*ICE\
+|TELEGRAM[a-z $-/:-?{-~!"^_`\\[\\]]*STICK\
+|TG[a-z $-/:-?{-~!"^_`\\[\\]]*STICK\
+|REM[insder]*\\b\
+|\\bREF|\\bSale|auction|multislot|stream|adopt'
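
The bracket class [a-z $-/:-?{-~!"^_`\[\]] used throughout this pattern covers lowercase letters, spaces, and most ASCII punctuation (but not digits), so each alternative tolerates filler text between its keywords. Moving the pattern here means it is built once at import time instead of on every check_filter() call, which is the "sped up filter checking" item above. A rough demonstration, with the pattern abridged and the titles invented:

    import re

    search = 'YCH[a-z $-/:-?{-~!"^_`\\[\\]]*OPEN|\\bREF|adopt'  # abridged stand-in

    for title in ("YCH - still OPEN!", "Character REF sheet", "Quiet forest painting"):
        print(title, "->", bool(re.search(search, title, re.IGNORECASE)))
    # -> True, True, False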

Modules/download.py

@@ -1,12 +1,16 @@
+import http.cookiejar as cookielib
 import json
-from tqdm import tqdm
-from pathvalidate import sanitize_filename
-import Modules.config as config
 import os
+
 import requests
 from bs4 import BeautifulSoup
-import http.cookiejar as cookielib
-from Modules.functions import system_message_handler, check_filter, download_complete
+from pathvalidate import sanitize_filename
+from tqdm import tqdm
+
+import Modules.config as config
+from Modules.functions import download_complete
+from Modules.functions import requests_retry_session
+from Modules.functions import system_message_handler
 
 session = requests.session()
 if config.cookies is not None:  # add cookies if present
@@ -14,8 +18,10 @@ if config.cookies is not None:  # add cookies if present
     cookies.load()
     session.cookies = cookies
 
+
 def download(path):
-    response = session.get(f"{config.BASE_URL}{path}")
+    response = requests_retry_session(session=session).get(f"{config.BASE_URL}{path}")
     s = BeautifulSoup(response.text, "html.parser")
 
     # System messages
@@ -23,78 +29,71 @@ def download(path):
         system_message_handler(s)
     image = s.find(class_="download").find("a").attrs.get("href")
-    title = s.find(class_="submission-title").find("p").contents[0]
-    title = sanitize_filename(title)
-    dsc = s.find(class_="submission-description").text.strip().replace("\r\n", "\n")
-    if config.json_description is True:
-        dsc = []
-    filename = image.split("/")[-1:][0]
-    data = {
-        "id": int(path.split("/")[-2:-1][0]),
-        "filename": filename,
-        "author": s.find(class_="submission-id-sub-container")
-        .find("a")
-        .find("strong")
-        .text,
-        "date": s.find(class_="popup_date").attrs.get("title"),
-        "title": title,
-        "description": dsc,
-        "url": f"{config.BASE_URL}{path}",
-        "tags": [],
-        "category": s.find(class_="info").find(class_="category-name").text,
-        "type": s.find(class_="info").find(class_="type-name").text,
-        "species": s.find(class_="info").findAll("div")[2].find("span").text,
-        "gender": s.find(class_="info").findAll("div")[3].find("span").text,
-        "views": int(s.find(class_="views").find(class_="font-large").text),
-        "favorites": int(s.find(class_="favorites").find(class_="font-large").text),
-        "rating": s.find(class_="rating-box").text.strip(),
-        "comments": [],
-    }
-    if config.submission_filter is True and check_filter(title) is True:
-        print(
-            f'{config.WARN_COLOR}"{title}" was filtered and will not be \
-downloaded - {data.get("url")}{config.END}'
-        )
-        return True
+    filename = sanitize_filename(image.split("/")[-1:][0])
+
+    author = s.find(class_="submission-id-sub-container").find("a").find("strong").text
+    title = sanitize_filename(s.find(class_="submission-title").find("p").contents[0])
+    view_id = int(path.split("/")[-2:-1][0])
+
+    output = f"{config.output_folder}/{author}"
+    rating = s.find(class_="rating-box").text.strip()
+
+    if config.category != "gallery":
+        output = f"{config.output_folder}/{author}/{config.category}"
+    if config.folder is not None:
+        output = f"{config.output_folder}/{author}/{config.folder}"
+    os.makedirs(output, exist_ok=True)
+
+    output_path = f"{output}/{title} ({view_id}) - {filename}"
+    output_path_fb = f"{output}/{title} - {filename}"
+    if config.rating is True:
+        os.makedirs(f"{output}/{rating}", exist_ok=True)
+        output_path = f"{output}/{rating}/{title} ({view_id}) - {filename}"
+        output_path_fb = f"{output}/{rating}/{title} - {filename}"
+
+    if config.dont_redownload is True and os.path.isfile(output_path_fb):
+        return file_exists_fallback(author, title)
 
     image_url = f"https:{image}"
-    output = f"{config.output_folder}/{data.get('author')}"
-    if config.category != "gallery":
-        output = f"{config.output_folder}/{data.get('author')}/{config.category}"
-    if config.folder is not None:
-        output = f"{config.output_folder}/{data.get('author')}/{config.folder}"
-    os.makedirs(output, exist_ok=True)
-    filename = sanitize_filename(filename)
-    output_path = f"{output}/{title} - {filename}"
-    if config.rating is True:
-        os.makedirs(f'{output}/{data.get("rating")}', exist_ok=True)
-        output_path = f'{output}/{data.get("rating")}/{title} - {filename}'
-    if config.dont_redownload is True and os.path.isfile(output_path):
-        if config.check is True:
-            print(
-                f"{config.SUCCESS_COLOR}Downloaded all recent files of \"{data.get('author')}\"{config.END}"
-            )
-            raise download_complete
-        print(
-            f'{config.WARN_COLOR}Skipping "{title}" since it\'s already downloaded{config.END}'
-        )
-        return True
-    else:
-        download_file(
-            image_url,
-            output_path,
-            f'{title} - \
-[{data.get("rating")}]',
-        )
+    download_file(
+        image_url,
+        output_path,
+        f"{title} - \
+[{rating}]",
+    )
+
     if config.metadata is True:
+        dsc = s.find(class_="submission-description").text.strip().replace("\r\n", "\n")
+        if config.json_description is True:
+            dsc = []
+        data = {
+            "id": view_id,
+            "filename": filename,
+            "author": author,
+            "date": s.find(class_="popup_date").attrs.get("title"),
+            "title": title,
+            "description": dsc,
+            "url": f"{config.BASE_URL}{path}",
+            "tags": [],
+            "category": s.find(class_="info").find(class_="category-name").text,
+            "type": s.find(class_="info").find(class_="type-name").text,
+            "species": s.find(class_="info").findAll("div")[2].find("span").text,
+            "gender": s.find(class_="info").findAll("div")[3].find("span").text,
+            "views": int(s.find(class_="views").find(class_="font-large").text),
+            "favorites": int(s.find(class_="favorites").find(class_="font-large").text),
+            "rating": rating,
+            "comments": [],
+        }
         create_metadata(output, data, s, title, filename)
+
     if config.download is not None:
-        print(f'{config.SUCCESS_COLOR}File saved as "{output_path}" {config.END}')
+        print(
+            f'{config.SUCCESS_COLOR}File saved as \
+"{output_path}" {config.END}'
+        )
     return True
 
 
 def download_file(url, fname, desc):
     try:
         r = session.get(url, stream=True)
@@ -121,7 +120,8 @@ def download_file(url, fname, desc):
             os.remove(fname)
             exit()
     return True
 
+
 def create_metadata(output, data, s, title, filename):
     if config.rating is True:
         os.makedirs(f'{output}/{data.get("rating")}/metadata', exist_ok=True)
@@ -163,4 +163,18 @@ def create_metadata(output, data, s, title, filename):
     # Write a UTF-8 encoded JSON file for metadata
     with open(f"{metadata}.json", "w", encoding="utf-8") as f:
         json.dump(data, f, ensure_ascii=False, indent=4)
+
+
+def file_exists_fallback(author, title):
+    if config.check is True:
+        print(
+            f'fallback: {config.SUCCESS_COLOR}Downloaded all recent files of \
+"{author}"{config.END}'
+        )
+        raise download_complete
+    print(
+        f'fallback: {config.WARN_COLOR}Skipping "{title}" since \
+it\'s already downloaded{config.END}'
+    )
+    return True
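
The new naming scheme embeds the submission id in the file name, while the old-style path is kept as a fallback so files downloaded before this commit are still recognized. A sketch of the two paths download() now builds (all values invented):

    output = "Output/artist"
    title, view_id, filename = "My Art", 12345678, "12345678.artist_my-art.png"

    output_path = f"{output}/{title} ({view_id}) - {filename}"  # new scheme, used for saving
    output_path_fb = f"{output}/{title} - {filename}"           # old scheme, checked by dont_redownload
    print(output_path)     # Output/artist/My Art (12345678) - 12345678.artist_my-art.png
    print(output_path_fb)  # Output/artist/My Art - 12345678.artist_my-art.png

Note that the on-disk check here only looks at the fallback path; files saved under the new scheme are instead caught by the index lookup in the main loop (see Modules/index.py below).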

Modules/functions.py

@@ -4,6 +4,8 @@ import re
 import browser_cookie3
 import requests
 from bs4 import BeautifulSoup
+from requests.adapters import HTTPAdapter
+from urllib3.util import Retry
 
 import Modules.config as config
@@ -13,39 +15,43 @@ if config.cookies is not None:  # add cookies if present
     cookies.load()
     session.cookies = cookies
+session.headers.update({"User-Agent": config.user_agent})
+
+
+def requests_retry_session(
+    retries=3,
+    backoff_factor=0.3,
+    status_forcelist=(500, 502, 504, 104),
+    session=None,
+):
+    session = session or requests.Session()
+    retry = Retry(
+        total=retries,
+        read=retries,
+        connect=retries,
+        backoff_factor=backoff_factor,
+        status_forcelist=status_forcelist,
+    )
+    adapter = HTTPAdapter(max_retries=retry)
+    session.mount("http://", adapter)
+    session.mount("https://", adapter)
+    return session
 
 
 class download_complete(Exception):
     pass
 
 
 def check_filter(title):
-    search = 'YCH[a-z $-/:-?{-~!"^_`\\[\\]]*OPEN\
-|OPEN[a-z $-/:-?{-~!"^_`\\[\\]]*YCH\
-|YCH[a-z $-/:-?{-~!"^_`\\[\\]]*CLOSE\
-|CLOSE[a-z $-/:-?{-~!"^_`\\[\\]]*YCH\
-|YCH[a-z $-/:-?{-~!"^_`\\[\\]]*ABLE\
-|AVAIL[a-z $-/:-?{-~!"^_`\\[\\]]*YCH\
-|YCH[a-z $-/:-?{-~!"^_`\\[\\]]*CLONE\
-|CLONE[a-z $-/:-?{-~!"^_`\\[\\]]*YCH\
-|YCH[a-z $-/:-?{-~!"^_`\\[\\]]*LIM\
-|LIM[a-z $-/:-?{-~!"^_`\\[\\]]*YCH\
-|COM[a-z $-/:-?{-~!"^_`\\[\\]]*OPEN\
-|OPEN[a-z $-/:-?{-~!"^_`\\[\\]]*COM\
-|COM[a-z $-/:-?{-~!"^_`\\[\\]]*CLOSE[^r]\
-|CLOSE[a-z $-/:-?{-~!"^_`\\[\\]]*COM\
-|FIX[a-z $-/:-?{-~!"^_`\\[\\]]*ICE\
-|TELEGRAM[a-z $-/:-?{-~!"^_`\\[\\]]*STICK\
-|TG[a-z $-/:-?{-~!"^_`\\[\\]]*STICK\
-|REM[insder]*\\b\
-|\\bREF|\\bSale|auction|multislot|stream|adopt'
     match = re.search(
-        search,
+        config.search,
         title,
         re.IGNORECASE,
     )
     if match is not None and title == match.string:
         return True
     return None
@@ -68,9 +74,7 @@ def system_message_handler(s):
     raise download_complete
 
 
-def login(user_agent):
-    session.headers.update({"User-Agent": user_agent})
+def login():
     CJ = browser_cookie3.load()
@@ -103,8 +107,6 @@ by using "-c cookies.txt"{config.END}'
 furaffinity in your browser, or you can export cookies.txt manually{config.END}"
     )
-    exit()
 
 
 def next_button(page_url):
     response = session.get(page_url)
@@ -130,15 +132,17 @@ def next_button(page_url):
         raise download_complete
         page_num = next_button.parent.attrs["action"].split("/")[-2]
     else:
+        next_button = s.find("a", class_="button standard right", text="Next")
         page_num = fav_next_button(s)
-    print(f"Downloading page {page_num} - {page_url}")
+    print(
+        f"Downloading page {page_num} - {config.BASE_URL}/{next_button.parent.attrs['action']}"
+    )
     return page_num
 
 
-def fav_next_button(s):
+def fav_next_button():
     # unlike galleries that are sequentially numbered, favorites use a different scheme.
     # the "page_num" is instead: [set of numbers]/next (the trailing /next is required)
-    next_button = s.find("a", class_="button standard right", text="Next")
     if next_button is None:
         print(f"{config.WARN_COLOR}Unable to find next button{config.END}")
         raise download_complete
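
requests_retry_session() mounts an HTTPAdapter with a urllib3 Retry policy onto an existing session, so requests that fail with one of the listed status codes are retried with exponential backoff (note that status_forcelist is matched against HTTP status codes, so the 104 entry will rarely, if ever, fire). A minimal usage sketch, mirroring the calls in Modules/download.py and the main script:

    import requests

    from Modules.functions import requests_retry_session

    session = requests.session()
    response = requests_retry_session(session=session).get("https://www.furaffinity.net")
    print(response.status_code)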

Modules/index.py (new file)

@@ -0,0 +1,37 @@
+import contextlib
+import re
+from pathlib import Path
+
+import Modules.config as config
+
+
+def start_indexing(path, layer=0):
+    """Recursively iterate over each item in path
+    and write every file's path into the index.
+    """
+    # make Path object from input string
+    path = Path(path)
+    with open(f"{config.output_folder}/index.idx", encoding="utf-8", mode="a+") as idx:
+        # iter the directory
+        for p in path.iterdir():
+            if p.is_file():
+                idx.write(f"{p}\n")
+            elif p.is_dir():
+                start_indexing(p, layer + 1)
+            else:
+                raise FileNotFoundError()
+
+
+def check_file(path):
+    view_id = path.split("/")[-2:-1][0]
+    with contextlib.suppress(FileNotFoundError):
+        with open(f"{config.output_folder}/index.idx", encoding="utf-8") as idx:
+            index = idx.read()
+            match = re.search(view_id, index)
+            if match is not None:
+                return True
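
The index is a flat text file listing every path under the output folder, and check_file() simply searches it for a submission id. An illustrative sketch (paths invented, assuming config.output_folder is "Output"):

    from Modules.index import check_file, start_indexing

    start_indexing("Output")              # appends every file path under Output/ to Output/index.idx
    print(check_file("/view/12345678/"))  # True if "12345678" occurs anywhere in the index

Because the lookup is a plain substring search, it relies on the new "{title} ({view_id}) - {filename}" naming to put the id into each path; a very short id could in principle match inside an unrelated filename.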

Main script

@@ -9,10 +9,14 @@ from bs4 import BeautifulSoup
 
 import Modules.config as config
 from Modules.download import download
+from Modules.functions import check_filter
 from Modules.functions import download_complete
 from Modules.functions import login
 from Modules.functions import next_button
+from Modules.functions import requests_retry_session
 from Modules.functions import system_message_handler
+from Modules.index import check_file
+from Modules.index import start_indexing
 
 # get session
 session = requests.session()
@@ -31,12 +35,13 @@ def main():
     while True:
         if config.stop == page_num:
             print(
-                f'{config.WARN_COLOR}Reached page "{config.stop}", stopping.{config.END}'
+                f'{config.WARN_COLOR}Reached page "{config.stop}", \
+stopping.{config.END}'
             )
             break
 
         page_url = f"{download_url}/{page_num}"
-        response = session.get(page_url)
+        response = requests_retry_session(session=session).get(page_url)
         s = BeautifulSoup(response.text, "html.parser")
 
         # System messages
@@ -50,7 +55,30 @@ def main():
 
         # Download all images on the page
         for img in s.findAll("figure"):
-            download(img.find("a").attrs.get("href"))
+            title = img.find("figcaption").contents[0].text
+            img_url = img.find("a").attrs.get("href")
+            if config.submission_filter is True and check_filter(title) is True:
+                print(
+                    f'{config.WARN_COLOR}"{title}" was filtered and will not be \
+downloaded - {config.BASE_URL}{img_url}{config.END}'
+                )
+                continue
+
+            if config.dont_redownload is True and check_file(img_url) is True:
+                if config.check is True:
+                    print(
+                        f'{config.SUCCESS_COLOR}Downloaded all recent files of \
+"{config.username[0]}"{config.END}'
+                    )
+                    raise download_complete
+                print(
+                    f'{config.WARN_COLOR}Skipping "{title}" since \
+it\'s already downloaded{config.END}'
+                )
+                continue
+
+            download(img_url)
             sleep(config.interval)
 
         page_num = next_button(page_url)
@@ -58,13 +86,18 @@
 
 if __name__ == "__main__":
     if config.login is True:
-        login(config.user_agent)
+        login()
+        exit()
+
+    if config.index is True:
+        if os.path.isfile(f"{config.output_folder}/index.idx"):
+            os.remove(f"{config.output_folder}/index.idx")
+        start_indexing(config.output_folder)
+        print(f"{config.SUCCESS_COLOR}indexing finished{config.END}")
         exit()
 
     try:
-        response = session.get(config.BASE_URL)
-    except ConnectionError:
-        print(f"{config.ERROR_COLOR}Connection failed{config.END}")
-        exit()
+        response = requests_retry_session(session=session).get(config.BASE_URL)
     except KeyboardInterrupt:
         print(f"{config.WARN_COLOR}Aborted by user{config.END}")
         exit()
@@ -72,14 +105,18 @@ if __name__ == "__main__":
 
     s = BeautifulSoup(response.text, "html.parser")
     if s.find(class_="loggedin_user_avatar") is not None:
         account_username = s.find(class_="loggedin_user_avatar").attrs.get("alt")
-        print(f'{config.SUCCESS_COLOR}Logged in as "{account_username}"{config.END}')
+        print(
+            f'{config.SUCCESS_COLOR}Logged in as \
+"{account_username}"{config.END}'
+        )
     else:
         print(
-            f"{config.WARN_COLOR}Not logged in, NSFW content is inaccessible{config.END}"
+            f"{config.WARN_COLOR}Not logged in, NSFW content \
+is inaccessible{config.END}"
        )
 
     if config.download is not None:
-        download(config.download)
+        download(f"/view/{config.download}/")
         exit()
 
     if config.submissions is True:
@@ -109,15 +146,29 @@ downloading "{config.folder[1]}"{config.END}'
         )
         exit()
 
-    if os.path.exists(config.username[0]):
-        data = open(config.username[0]).read()
-        config.username = filter(None, data.split("\n"))
+    try:
+        if os.path.exists(config.username[0]):
+            data = open(config.username[0]).read()
+            config.username = filter(None, data.split("\n"))
+    except TypeError or AttributeError:
+        print(
+            f"{config.ERROR_COLOR}Please enter a username \
+or provide a file with usernames (1 username per line){config.END}"
+        )
+        exit()
 
     for username in config.username:
-        print(f'{config.SUCCESS_COLOR}Now downloading "{username}"{config.END}')
-        download_url = f"{config.BASE_URL}/{config.category}/{username}"
-        main()
-        print(
-            f'{config.SUCCESS_COLOR}Finished \
-downloading "{username}"{config.END}'
-        )
+        username = username.split("#")[0].translate(
+            str.maketrans(config.username_replace_chars)
+        )
+        if username != "":
+            print(f'{config.SUCCESS_COLOR}Now downloading "{username}"{config.END}')
+            download_url = f"{config.BASE_URL}/{config.category}/{username}"
+            main()
+            print(
+                f'{config.SUCCESS_COLOR}Finished \
+downloading "{username}"{config.END}'
+            )
+
+    if os.path.isfile(f"{config.output_folder}/index.idx"):
+        os.remove(f"{config.output_folder}/index.idx")
+        start_indexing(config.output_folder)
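
Taken together with the config changes, the per-username preprocessing behaves like this (usernames invented; username_replace_chars is the {" ": "", "_": ""} mapping added in Modules/config.py):

    username_replace_chars = {" ": "", "_": ""}

    for raw in ('some_artist  # my favourite', '# a whole-line comment', 'plain-name'):
        username = raw.split("#")[0].translate(str.maketrans(username_replace_chars))
        print(repr(username))
    # -> 'someartist', '', 'plain-name' (empty results are skipped by the if username != "" guard)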