changelog:

ability to add comments to a username list with "#"
automatic removal of "_" from usernames
faster filter checking
basic indexing to speed up the already-downloaded check when fetching newer files
other small changes
Kentai Radiquum 2022-07-10 03:24:39 +05:00
parent 377df392e5
commit 675f558d03
5 changed files with 260 additions and 122 deletions
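
The first two changelog items are easiest to see in a small sketch. A minimal, hypothetical example of the new normalization applied to each line of a username list ("#" starts a comment; spaces and underscores are stripped via the new username_replace_chars map):

line = "some_user name  # trailing comment"
username_replace_chars = {" ": "", "_": ""}
username = line.split("#")[0].translate(str.maketrans(username_replace_chars))
print(username)  # -> "someusername"

Lines that are empty after stripping (pure comment lines) are skipped by the if username != "": check in the main loop.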

View file

@@ -72,7 +72,10 @@ parser.add_argument(
Folder-Name-Here",
)
parser.add_argument(
"-s", "--start", default=1, help="page number to start from",
"-s",
"--start",
default=1,
help="page number to start from",
)
parser.add_argument(
"-S",
@@ -115,7 +118,7 @@ parser.add_argument(
)
parser.add_argument(
"--download",
help="download a specific submission /view/12345678/",
help="download a specific submission by providing its id",
)
parser.add_argument(
"-jd",
@@ -129,6 +132,11 @@ parser.add_argument(
action="store_true",
help="extract furaffinity cookies directly from your browser",
)
parser.add_argument(
"--index",
action="store_true",
help="create an index of downloaded files in an output folder",
)
args = parser.parse_args()
@@ -136,7 +144,7 @@ args = parser.parse_args()
username = args.username
category = args.category
if username != None:
if username is not None:
username = username.split(" ")
# Custom input
@@ -153,6 +161,7 @@ folder = args.folder
login = args.login
check = args.check
index = args.index
submissions = args.submissions
json_description = args.json_description
metadata = args.metadata
@@ -168,3 +177,26 @@ END = "\033[0m"
# Globals
BASE_URL = "https://www.furaffinity.net"
username_replace_chars = {
" ": "",
"_": "",
}
search = 'YCH[a-z $-/:-?{-~!"^_`\\[\\]]*OPEN\
|OPEN[a-z $-/:-?{-~!"^_`\\[\\]]*YCH\
|YCH[a-z $-/:-?{-~!"^_`\\[\\]]*CLOSE\
|CLOSE[a-z $-/:-?{-~!"^_`\\[\\]]*YCH\
|YCH[a-z $-/:-?{-~!"^_`\\[\\]]*ABLE\
|AVAIL[a-z $-/:-?{-~!"^_`\\[\\]]*YCH\
|YCH[a-z $-/:-?{-~!"^_`\\[\\]]*CLONE\
|CLONE[a-z $-/:-?{-~!"^_`\\[\\]]*YCH\
|YCH[a-z $-/:-?{-~!"^_`\\[\\]]*LIM\
|LIM[a-z $-/:-?{-~!"^_`\\[\\]]*YCH\
|COM[a-z $-/:-?{-~!"^_`\\[\\]]*OPEN\
|OPEN[a-z $-/:-?{-~!"^_`\\[\\]]*COM\
|COM[a-z $-/:-?{-~!"^_`\\[\\]]*CLOSE[^r]\
|CLOSE[a-z $-/:-?{-~!"^_`\\[\\]]*COM\
|FIX[a-z $-/:-?{-~!"^_`\\[\\]]*ICE\
|TELEGRAM[a-z $-/:-?{-~!"^_`\\[\\]]*STICK\
|TG[a-z $-/:-?{-~!"^_`\\[\\]]*STICK\
|REM[insder]*\\b\
|\\bREF|\\bSale|auction|multislot|stream|adopt'
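
For reference, a minimal sketch of how this pattern is exercised (the sample titles are invented; the real call site, check_filter, appears later in this diff):

import re

for title in ("YCH OPEN - 2 slots", "Quiet forest walk"):
    match = re.search(search, title, re.IGNORECASE)
    print(title, "->", "kept" if match is None else "filtered")
# "YCH OPEN - 2 slots" is filtered; "Quiet forest walk" is kept.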

View file

@@ -1,12 +1,16 @@
import http.cookiejar as cookielib
import json
from tqdm import tqdm
from pathvalidate import sanitize_filename
import Modules.config as config
import os
import requests
from bs4 import BeautifulSoup
import http.cookiejar as cookielib
from Modules.functions import system_message_handler, check_filter, download_complete
from pathvalidate import sanitize_filename
from tqdm import tqdm
import Modules.config as config
from Modules.functions import download_complete
from Modules.functions import requests_retry_session
from Modules.functions import system_message_handler
session = requests.session()
if config.cookies is not None: # add cookies if present
@@ -14,8 +18,10 @@ if config.cookies is not None: # add cookies if present
cookies.load()
session.cookies = cookies
def download(path):
response = session.get(f"{config.BASE_URL}{path}")
response = requests_retry_session(session=session).get(f"{config.BASE_URL}{path}")
s = BeautifulSoup(response.text, "html.parser")
# System messages
@@ -23,78 +29,71 @@ def download(path):
system_message_handler(s)
image = s.find(class_="download").find("a").attrs.get("href")
title = s.find(class_="submission-title").find("p").contents[0]
title = sanitize_filename(title)
dsc = s.find(class_="submission-description").text.strip().replace("\r\n", "\n")
filename = sanitize_filename(image.split("/")[-1:][0])
if config.json_description is True:
dsc = []
filename = image.split("/")[-1:][0]
data = {
"id": int(path.split("/")[-2:-1][0]),
"filename": filename,
"author": s.find(class_="submission-id-sub-container")
.find("a")
.find("strong")
.text,
"date": s.find(class_="popup_date").attrs.get("title"),
"title": title,
"description": dsc,
"url": f"{config.BASE_URL}{path}",
"tags": [],
"category": s.find(class_="info").find(class_="category-name").text,
"type": s.find(class_="info").find(class_="type-name").text,
"species": s.find(class_="info").findAll("div")[2].find("span").text,
"gender": s.find(class_="info").findAll("div")[3].find("span").text,
"views": int(s.find(class_="views").find(class_="font-large").text),
"favorites": int(s.find(class_="favorites").find(class_="font-large").text),
"rating": s.find(class_="rating-box").text.strip(),
"comments": [],
}
if config.submission_filter is True and check_filter(title) is True:
print(
f'{config.WARN_COLOR}"{title}" was filtered and will not be \
downloaded - {data.get("url")}{config.END}'
)
return True
author = s.find(class_="submission-id-sub-container").find("a").find("strong").text
title = sanitize_filename(s.find(class_="submission-title").find("p").contents[0])
view_id = int(path.split("/")[-2:-1][0])
output = f"{config.output_folder}/{author}"
rating = s.find(class_="rating-box").text.strip()
if config.category != "gallery":
output = f"{config.output_folder}/{author}/{config.category}"
if config.folder is not None:
output = f"{config.output_folder}/{author}/{config.folder}"
os.makedirs(output, exist_ok=True)
output_path = f"{output}/{title} ({view_id}) - {filename}"
output_path_fb = f"{output}/{title} - {filename}"
if config.rating is True:
os.makedirs(f"{output}/{rating}", exist_ok=True)
output_path = f"{output}/{rating}/{title} ({view_id}) - {filename}"
output_path_fb = f"{output}/{rating}/{title} - {filename}"
if config.dont_redownload is True and os.path.isfile(output_path_fb):
return file_exists_fallback(author, title)
image_url = f"https:{image}"
output = f"{config.output_folder}/{data.get('author')}"
if config.category != "gallery":
output = f"{config.output_folder}/{data.get('author')}/{config.category}"
if config.folder is not None:
output = f"{config.output_folder}/{data.get('author')}/{config.folder}"
os.makedirs(output, exist_ok=True)
filename = sanitize_filename(filename)
output_path = f"{output}/{title} - {filename}"
if config.rating is True:
os.makedirs(f'{output}/{data.get("rating")}', exist_ok=True)
output_path = f'{output}/{data.get("rating")}/{title} - {filename}'
if config.dont_redownload is True and os.path.isfile(output_path):
if config.check is True:
print(
f"{config.SUCCESS_COLOR}Downloaded all recent files of \"{data.get('author')}\"{config.END}"
)
raise download_complete
print(
f'{config.WARN_COLOR}Skipping "{title}" since it\'s already downloaded{config.END}'
)
return True
else:
download_file(
image_url,
output_path,
f'{title} - \
[{data.get("rating")}]',
)
download_file(
image_url,
output_path,
f"{title} - \
[{rating}]",
)
if config.metadata is True:
dsc = s.find(class_="submission-description").text.strip().replace("\r\n", "\n")
if config.json_description is True:
dsc = []
data = {
"id": view_id,
"filename": filename,
"author": author,
"date": s.find(class_="popup_date").attrs.get("title"),
"title": title,
"description": dsc,
"url": f"{config.BASE_URL}{path}",
"tags": [],
"category": s.find(class_="info").find(class_="category-name").text,
"type": s.find(class_="info").find(class_="type-name").text,
"species": s.find(class_="info").findAll("div")[2].find("span").text,
"gender": s.find(class_="info").findAll("div")[3].find("span").text,
"views": int(s.find(class_="views").find(class_="font-large").text),
"favorites": int(s.find(class_="favorites").find(class_="font-large").text),
"rating": rating,
"comments": [],
}
create_metadata(output, data, s, title, filename)
if config.download is not None:
print(f'{config.SUCCESS_COLOR}File saved as "{output_path}" {config.END}')
print(
f'{config.SUCCESS_COLOR}File saved as \
"{output_path}" {config.END}'
)
return True
def download_file(url, fname, desc):
try:
r = session.get(url, stream=True)
@@ -121,7 +120,8 @@ def download_file(url, fname, desc):
os.remove(fname)
exit()
return True
def create_metadata(output, data, s, title, filename):
if config.rating is True:
os.makedirs(f'{output}/{data.get("rating")}/metadata', exist_ok=True)
@@ -163,4 +163,18 @@ def create_metadata(output, data, s, title, filename):
# Write a UTF-8 encoded JSON file for metadata
with open(f"{metadata}.json", "w", encoding="utf-8") as f:
json.dump(data, f, ensure_ascii=False, indent=4)
def file_exists_fallback(author, title):
if config.check is True:
print(
f'fallback: {config.SUCCESS_COLOR}Downloaded all recent files of \
"{author}"{config.END}'
)
raise download_complete
print(
f'fallback: {config.WARN_COLOR}Skipping "{title}" since \
it\'s already downloaded{config.END}'
)
return True
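
In short, filenames now embed the submission id, and the old id-less name is kept only as a fallback so files downloaded before this commit are still recognized. A sketch with hypothetical values:

output, title, view_id, filename = "FA.files/artist", "Artwork", 12345678, "img.png"
output_path = f"{output}/{title} ({view_id}) - {filename}"  # new naming scheme
output_path_fb = f"{output}/{title} - {filename}"           # old naming scheme
# file_exists_fallback() fires when the old-style path already exists,
# so nothing downloaded under the previous scheme is fetched twice.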

View file

@@ -4,6 +4,8 @@ import re
import browser_cookie3
import requests
from bs4 import BeautifulSoup
from requests.adapters import HTTPAdapter
from urllib3.util import Retry
import Modules.config as config
@@ -13,39 +15,43 @@ if config.cookies is not None: # add cookies if present
cookies.load()
session.cookies = cookies
session.headers.update({"User-Agent": config.user_agent})
def requests_retry_session(
retries=3,
backoff_factor=0.3,
status_forcelist=(500, 502, 504, 104),
session=None,
):
session = session or requests.Session()
retry = Retry(
total=retries,
read=retries,
connect=retries,
backoff_factor=backoff_factor,
status_forcelist=status_forcelist,
)
adapter = HTTPAdapter(max_retries=retry)
session.mount("http://", adapter)
session.mount("https://", adapter)
return session
class download_complete(Exception):
pass
def check_filter(title):
search = 'YCH[a-z $-/:-?{-~!"^_`\\[\\]]*OPEN\
|OPEN[a-z $-/:-?{-~!"^_`\\[\\]]*YCH\
|YCH[a-z $-/:-?{-~!"^_`\\[\\]]*CLOSE\
|CLOSE[a-z $-/:-?{-~!"^_`\\[\\]]*YCH\
|YCH[a-z $-/:-?{-~!"^_`\\[\\]]*ABLE\
|AVAIL[a-z $-/:-?{-~!"^_`\\[\\]]*YCH\
|YCH[a-z $-/:-?{-~!"^_`\\[\\]]*CLONE\
|CLONE[a-z $-/:-?{-~!"^_`\\[\\]]*YCH\
|YCH[a-z $-/:-?{-~!"^_`\\[\\]]*LIM\
|LIM[a-z $-/:-?{-~!"^_`\\[\\]]*YCH\
|COM[a-z $-/:-?{-~!"^_`\\[\\]]*OPEN\
|OPEN[a-z $-/:-?{-~!"^_`\\[\\]]*COM\
|COM[a-z $-/:-?{-~!"^_`\\[\\]]*CLOSE[^r]\
|CLOSE[a-z $-/:-?{-~!"^_`\\[\\]]*COM\
|FIX[a-z $-/:-?{-~!"^_`\\[\\]]*ICE\
|TELEGRAM[a-z $-/:-?{-~!"^_`\\[\\]]*STICK\
|TG[a-z $-/:-?{-~!"^_`\\[\\]]*STICK\
|REM[insder]*\\b\
|\\bREF|\\bSale|auction|multislot|stream|adopt'
match = re.search(
search,
config.search,
title,
re.IGNORECASE,
)
if match is not None and title == match.string:
return True
return None
@@ -68,9 +74,7 @@ def system_message_handler(s):
raise download_complete
def login(user_agent):
session.headers.update({"User-Agent": user_agent})
def login():
CJ = browser_cookie3.load()
@@ -103,8 +107,6 @@ by using "-c cookies.txt"{config.END}'
furaffinity in your browser, or you can export cookies.txt manually{config.END}"
)
exit()
def next_button(page_url):
response = session.get(page_url)
@@ -130,15 +132,17 @@ def next_button(page_url):
raise download_complete
page_num = next_button.parent.attrs["action"].split("/")[-2]
else:
next_button = s.find("a", class_="button standard right", text="Next")
page_num = fav_next_button(s)
print(f"Downloading page {page_num} - {page_url}")
print(
f"Downloading page {page_num} - {config.BASE_URL}/{next_button.parent.attrs['action']}"
)
return page_num
def fav_next_button():
def fav_next_button(s):
# unlike galleries that are sequentially numbered, favorites use a different scheme.
# the "page_num" is instead: [set of numbers]/next (the trailing /next is required)
next_button = s.find("a", class_="button standard right", text="Next")
if next_button is None:
print(f"{config.WARN_COLOR}Unable to find next button{config.END}")
raise download_complete
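
A minimal usage sketch of the new retry helper (the URL is a placeholder): wrapping a session with requests_retry_session mounts an HTTPAdapter whose urllib3 Retry re-issues the request up to three times with exponential backoff when the server answers 500, 502, or 504.

import requests

session = requests.session()
response = requests_retry_session(session=session).get("https://www.furaffinity.net")
print(response.status_code)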

Modules/index.py (new file, 37 lines)
View file

@@ -0,0 +1,37 @@
import contextlib
import re
from pathlib import Path
import Modules.config as config
def start_indexing(path, layer=0):
"""Recursively iterate over each item in path
and print item's name.
"""
# make Path object from input string
path = Path(path)
with open(f"{config.output_folder}/index.idx", encoding="utf-8", mode="a+") as idx:
# iter the directory
for p in path.iterdir():
if p.is_file():
idx.write(f"{p}\n")
elif p.is_dir():
start_indexing(p, layer + 1)
else:
raise FileNotFoundError()
def check_file(path):
view_id = path.split("/")[-2:-1][0]
with contextlib.suppress(FileNotFoundError):
with open(f"{config.output_folder}/index.idx", encoding="utf-8") as idx:
index = idx.read()
match = re.search(view_id, index)
if match is not None:
return True
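
Roughly how the index is meant to be used (the view id below is hypothetical, and start_indexing always appends to config.output_folder/index.idx regardless of the path argument): the output folder is walked once up front, and check_file then searches that single file for a submission's view id instead of probing the filesystem per submission.

start_indexing(config.output_folder)  # writes <output_folder>/index.idx
if check_file("/view/12345678/"):     # True if "12345678" appears in the index
    print("already downloaded, skipping")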

View file

@@ -9,10 +9,14 @@ from bs4 import BeautifulSoup
import Modules.config as config
from Modules.download import download
from Modules.functions import check_filter
from Modules.functions import download_complete
from Modules.functions import login
from Modules.functions import next_button
from Modules.functions import requests_retry_session
from Modules.functions import system_message_handler
from Modules.index import check_file
from Modules.index import start_indexing
# get session
session = requests.session()
@@ -31,12 +35,13 @@ def main():
while True:
if config.stop == page_num:
print(
f'{config.WARN_COLOR}Reached page "{config.stop}", stopping.{config.END}'
f'{config.WARN_COLOR}Reached page "{config.stop}", \
stopping.{config.END}'
)
break
page_url = f"{download_url}/{page_num}"
response = session.get(page_url)
response = requests_retry_session(session=session).get(page_url)
s = BeautifulSoup(response.text, "html.parser")
# System messages
@@ -50,7 +55,30 @@ def main():
# Download all images on the page
for img in s.findAll("figure"):
download(img.find("a").attrs.get("href"))
title = img.find("figcaption").contents[0].text
img_url = img.find("a").attrs.get("href")
if config.submission_filter is True and check_filter(title) is True:
print(
f'{config.WARN_COLOR}"{title}" was filtered and will not be \
downloaded - {config.BASE_URL}{img_url}{config.END}'
)
continue
if config.dont_redownload is True and check_file(img_url) is True:
if config.check is True:
print(
f'{config.SUCCESS_COLOR}Downloaded all recent files of \
"{config.username[0]}"{config.END}'
)
raise download_complete
print(
f'{config.WARN_COLOR}Skipping "{title}" since \
it\'s already downloaded{config.END}'
)
continue
download(img_url)
sleep(config.interval)
page_num = next_button(page_url)
@@ -58,13 +86,18 @@ if __name__ == "__main__":
if __name__ == "__main__":
if config.login is True:
login(config.user_agent)
login()
exit()
if config.index is True:
if os.path.isfile(f"{config.output_folder}/index.idx"):
os.remove(f"{config.output_folder}/index.idx")
start_indexing(config.output_folder)
print(f"{config.SUCCESS_COLOR}indexing finished{config.END}")
exit()
try:
response = session.get(config.BASE_URL)
except ConnectionError:
print(f"{config.ERROR_COLOR}Connection failed{config.END}")
exit()
response = requests_retry_session(session=session).get(config.BASE_URL)
except KeyboardInterrupt:
print(f"{config.WARN_COLOR}Aborted by user{config.END}")
exit()
@@ -72,14 +105,18 @@ if __name__ == "__main__":
s = BeautifulSoup(response.text, "html.parser")
if s.find(class_="loggedin_user_avatar") is not None:
account_username = s.find(class_="loggedin_user_avatar").attrs.get("alt")
print(f'{config.SUCCESS_COLOR}Logged in as "{account_username}"{config.END}')
print(
f'{config.SUCCESS_COLOR}Logged in as \
"{account_username}"{config.END}'
)
else:
print(
f"{config.WARN_COLOR}Not logged in, NSFW content is inaccessible{config.END}"
f"{config.WARN_COLOR}Not logged in, NSFW content \
is inaccessible{config.END}"
)
if config.download is not None:
download(config.download)
download(f"/view/{config.download}/")
exit()
if config.submissions is True:
@@ -109,15 +146,29 @@ downloading "{config.folder[1]}"{config.END}'
)
exit()
if os.path.exists(config.username[0]):
data = open(config.username[0]).read()
config.username = filter(None, data.split("\n"))
try:
if os.path.exists(config.username[0]):
data = open(config.username[0]).read()
config.username = filter(None, data.split("\n"))
except (TypeError, AttributeError):
print(
f"{config.ERROR_COLOR}Please enter a username \
or provide a file with usernames (1 username per line){config.END}"
)
exit()
for username in config.username:
print(f'{config.SUCCESS_COLOR}Now downloading "{username}"{config.END}')
download_url = f"{config.BASE_URL}/{config.category}/{username}"
main()
print(
f'{config.SUCCESS_COLOR}Finished \
downloading "{username}"{config.END}'
username = username.split("#")[0].translate(
str.maketrans(config.username_replace_chars)
)
if username != "":
print(f'{config.SUCCESS_COLOR}Now downloading "{username}"{config.END}')
download_url = f"{config.BASE_URL}/{config.category}/{username}"
main()
print(
f'{config.SUCCESS_COLOR}Finished \
downloading "{username}"{config.END}'
)
if os.path.isfile(f"{config.output_folder}/index.idx"):
os.remove(f"{config.output_folder}/index.idx")
start_indexing(config.output_folder)