Mirror of https://github.com/Radiquum/furaffinity-dl.git, synced 2025-05-06 04:49:38 +05:00
"rewrited" some part of the code
Create a directory tree based on the author name, filter posts like "YCH OPEN, REMINDER", and rework the argument parser.
This commit is contained in:
parent 77760e7135
commit 6e51fffed7
5 changed files with 376 additions and 211 deletions
491 furaffinity-dl.py (Executable file → Normal file)
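The "filter posts" part of the change boils down to a case-insensitive keyword match on submission titles. Below is a minimal standalone sketch of that behavior; the keyword pattern is copied from the new download() in the diff, while the sample titles are invented. Note that re.Match.string is the entire input string, so the diff's `title == match.string` guard is always true once a match is found; in effect any title containing a keyword is skipped:

import re

# Keyword pattern as committed in download(); the sample titles below are hypothetical.
FILTER_PATTERN = re.compile("reminder|auction|YCH Open|YCH Closed|Adopt|pre-order", re.IGNORECASE)

def is_filtered(title):
    # True when any sale-related keyword occurs anywhere in the title.
    return FILTER_PATTERN.search(title) is not None

for title in ["YCH OPEN - 2 slots", "Sunset over the bay", "Auction reminder"]:
    print(title, "->", "skipped" if is_filtered(title) else "downloaded")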
@@ -1,99 +1,161 @@
#!/usr/bin/python3
import argparse
from types import NoneType
from tqdm import tqdm
from argparse import RawTextHelpFormatter
import json
from bs4 import BeautifulSoup
import requests
import http.cookiejar as cookielib
import re
import json
import os
import re
from time import sleep

'''
Please refer to LICENSE for licensing conditions.
'''
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

# Argument parsing
parser = argparse.ArgumentParser(formatter_class=RawTextHelpFormatter, description='Downloads the entire gallery/scraps/favorites of a furaffinity user, or your submissions', epilog='''
parser = argparse.ArgumentParser(
    formatter_class=argparse.RawTextHelpFormatter,
    description="Downloads the entire gallery/scraps/favorites of a furaffinity user, or your submissions",
    epilog="""
Examples:
 python3 furaffinity-dl.py gallery koul
 python3 furaffinity-dl.py koul koul_gallery
 python3 furaffinity-dl.py -o koulsArt gallery koul
 python3 furaffinity-dl.py -o mylasFavs favorites mylafox\n
 python3 furaffinity-dl.py -o mylasFavs --category favorites mylafox\n
You can also log in to FurAffinity in a web browser and load cookies to download Age restricted content or Submissions:
 python3 furaffinity-dl.py -c cookies.txt gallery letodoesart
 python3 furaffinity-dl.py -c cookies.txt msg submissions\n
 python3 furaffinity-dl.py -c cookies.txt --submissions\n
DISCLAIMER: It is your own responsibility to check whether batch downloading is allowed by FurAffinity terms of service and to abide by them.
''')
parser.add_argument('category', metavar='category', type=str, nargs='?', default='gallery', help='the category to download, gallery/scraps/favorites')
parser.add_argument('username', metavar='username', type=str, nargs='?', help='username of the furaffinity user')
parser.add_argument('folder', metavar='folder', type=str, nargs='?', help='name of the folder (full path, for instance 123456/Folder-Name-Here)')
parser.add_argument('--output', '-o', dest='output', type=str, default='.', help="output directory")
parser.add_argument('--cookies', '-c', dest='cookies', type=str, default='', help="path to a NetScape cookies file")
parser.add_argument('--ua', '-u', dest='ua', type=str, default='Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:68.7) Gecko/20100101 Firefox/68.7', help="Your browser's useragent, may be required, depending on your luck")
parser.add_argument('--start', '-s', dest='start', type=str, default=1, help="page number to start from")
parser.add_argument('--stop', '-S', dest='stop', type=str, default='', help="Page number to stop on. For favorites pages, specify the full URL after the username (1234567890/next).")
parser.add_argument('--dont-redownload', '-d', const='dont_redownload', action='store_const', help="Don't redownload files that have already been downloaded")
parser.add_argument('--interval', '-i', dest='interval', type=float, default=0, help="delay between downloading pages")
parser.add_argument('--metadir', '-m', dest='metadir', type=str, default=None, help="directory to put meta files in")
""",
)

# General stuff
parser.add_argument(
    "--category",
    "-ca",
    type=str,
    nargs="?",
    help="the category to download, gallery/scraps/favorites [default: gallery]",
    const=1,
    default="gallery",
)
parser.add_argument(
    "--submissions",
    "-su",
    action="store_true",
    help="download your submissions",
)
parser.add_argument(
    "username",
    type=str,
    help="username of the furaffinity user [required]",
)
parser.add_argument(
    "--folder",
    type=str,
    help="full path of the furaffinity folder. for instance 123456/Folder-Name-Here",
)
parser.add_argument(
    "--output", type=str, default="furaffinity-dl", help="output directory [default: furaffinity-dl]"
)
parser.add_argument(
    "--cookies",
    "-c",
    dest="cookies",
    type=str,
    help="path to a NetScape cookies file",
)
parser.add_argument(
    "--user-agent",
    "-u",
    dest="ua",
    type=str,
    nargs="?",
    default="Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:68.7) Gecko/20100101 Firefox/68.7",
    help="Your browser's useragent, may be required, depending on your luck",
)
parser.add_argument(
    "--start", "-s", type=str, default=1, help="page number to start from", nargs="?"
)
parser.add_argument(
    "--stop",
    "-S",
    dest="stop",
    type=str,
    help="Page number to stop on. For favorites pages, specify the full URL after the username (1234567890/next).",
)
parser.add_argument(
    "--dont-redownload",
    "-d",
    action="store_false",
    help="Don't redownload files that have already been downloaded [default: true]",
)
parser.add_argument(
    "--interval",
    "-i",
    dest="interval",
    type=float,
    default=0,
    help="delay between downloading pages in seconds [default: 0]",
)
parser.add_argument(
    "--rating",
    "-r",
    action="store_false",
    help="enable rating separation [default: true]",
)
parser.add_argument(
    "--metadata",
    "-m",
    action="store_true",
    help="enable downloading of metadata [default: false]",
)

args = parser.parse_args()

BASE_URL = "https://www.furaffinity.net"
categories = {
    "gallery": "gallery",
    "scraps": "scraps",
    "favorites": "favorites",
}
category = categories.get(args.category)
if category is None:
    print("please enter a valid category")
    exit()
if args.username is None:
    parser.print_help()
    print("please enter a FA Username")
    exit()
if args.output is None:
    print("please enter a output folder")
    exit()

# Create output directory if it doesn't exist
if args.output != '.':
    os.makedirs(args.output, exist_ok=True)

if args.metadir == None:
    args.metadir = args.output
else:
    os.makedirs(args.metadir, exist_ok=True)
username = args.username
output = f"{args.output}/{args.username}"
metadata = f"{output}/metadata"

filter = {"YCH Open", "Reminder", "YCH Closed", "Auction"}

# Check validity of category
valid_categories = ['gallery', 'favorites', 'scraps', 'msg']
if args.category not in valid_categories:
    raise Exception('Category is not valid', args.category)

# Check validity of username
if bool(re.compile(r'[^a-zA-Z0-9\-~._]').search(args.username)):
    raise Exception('Username contains non-valid characters', args.username)

# Initialise a session
session = requests.session()
session.headers.update({'User-Agent': args.ua})
session.headers.update({"User-Agent": args.ua})

# Load cookies from a netscape cookie file (if provided)
if args.cookies != '':
if args.cookies is not None:
    cookies = cookielib.MozillaCookieJar(args.cookies)
    cookies.load()
    session.cookies = cookies

base_url = 'https://www.furaffinity.net'
gallery_url = '{}/{}/{}'.format(base_url, args.category, args.username)
if args.folder is not None:
    gallery_url += "/folder/"
    gallery_url += args.folder
page_num = args.start


def download_file(url, fname, desc):
    r = session.get(url, stream=True)
    if r.status_code != 200:
        print("Got a HTTP {} while downloading {}; skipping".format(r.status_code, fname))
        print(f"Got a HTTP {r.status_code} while downloading {fname}; ...skipping")
        return False

    total = int(r.headers.get('Content-Length', 0))
    with open(fname, 'wb') as file, tqdm(
    total = int(r.headers.get("Content-Length", 0))
    with open(fname, "wb") as file, tqdm(
        desc=desc.ljust(40)[:40],
        total=total,
        miniters=100,
        unit='b',
        unit="b",
        unit_scale=True,
        unit_divisor=1024
        unit_divisor=1024,
    ) as bar:
        for data in r.iter_content(chunk_size=1024):
            size = file.write(data)
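The "directory tree based on author name" change in this hunk replaces the old flat output directory with a per-user tree: downloads land under output/username, optionally split into per-rating subfolders, with metadata in its own folder. A minimal sketch of the path construction under assumed defaults (--output furaffinity-dl, rating separation left on; the username, rating, and filename are illustrative):

import os

output = "furaffinity-dl" + "/" + "koul"        # f"{args.output}/{args.username}"
rating = "General"                               # data.get("rating"), scraped from the submission page
output_path = f"{output}/{rating}/example.png"   # per-rating subfolder; filename is made up
metadata = f"{output}/metadata"                  # created only when --metadata is passed

os.makedirs(os.path.dirname(output_path), exist_ok=True)
print(output_path)  # furaffinity-dl/koul/General/example.png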
@@ -101,161 +163,212 @@ def download_file(url, fname, desc):
    return True


# The cursed function that handles downloading
download_url = f"{BASE_URL}/{category}/{username}"
if args.folder is not None:
    download_url = f"{BASE_URL}/gallery/{username}/folder/{args.folder}"
if args.submissions is True:
    download_url = f"{BASE_URL}/msg/submissions"


def download(path):
    page_url = '{}{}'.format(base_url, path)
    response = session.get(page_url)
    s = BeautifulSoup(response.text, 'html.parser')
    response = session.get(f"{BASE_URL}{path}")
    s = BeautifulSoup(response.text, "html.parser")

    # System messages
    if s.find(class_='notice-message') is not None:
        message = s.find(class_='notice-message').find('div').find(class_="link-override").text.strip()
        raise Exception('System Message', message)
    if s.find(class_="notice-message") is not None:
        message = (
            s.find(class_="notice-message")
            .find("div")
            .find(class_="link-override")
            .text.strip()
        )
        raise Exception("System Message", message)

    image = s.find(class_='download').find('a').attrs.get('href')
    title = s.find(class_='submission-title').find('p').contents[0]
    image = s.find(class_="download").find("a").attrs.get("href")
    title = s.find(class_="submission-title").find("p").contents[0]
    filename = image.split("/")[-1:][0]
    data = {
        'id': int(path.split('/')[-2:-1][0]),
        'filename': filename,
        'author': s.find(class_='submission-id-sub-container').find('a').find('strong').text,
        'date': s.find(class_='popup_date').attrs.get('title'),
        'title': title,
        'description': s.find(class_='submission-description').text.strip().replace('\r\n', '\n'),
        "id": int(path.split("/")[-2:-1][0]),
        "filename": filename,
        "author": s.find(class_="submission-id-sub-container")
        .find("a")
        .find("strong")
        .text,
        "date": s.find(class_="popup_date").attrs.get("title"),
        "title": title,
        "description": s.find(class_="submission-description")
        .text.strip()
        .replace("\r\n", "\n"),
        "tags": [],
        'category': s.find(class_='info').find(class_='category-name').text,
        'type': s.find(class_='info').find(class_='type-name').text,
        'species': s.find(class_='info').findAll('div')[2].find('span').text,
        'gender': s.find(class_='info').findAll('div')[3].find('span').text,
        'views': int(s.find(class_='views').find(class_='font-large').text),
        'favorites': int(s.find(class_='favorites').find(class_='font-large').text),
        'rating': s.find(class_='rating-box').text.strip(),
        'comments': []
        "category": s.find(class_="info").find(class_="category-name").text,
        "type": s.find(class_="info").find(class_="type-name").text,
        "species": s.find(class_="info").findAll("div")[2].find("span").text,
        "gender": s.find(class_="info").findAll("div")[3].find("span").text,
        "views": int(s.find(class_="views").find(class_="font-large").text),
        "favorites": int(s.find(class_="favorites").find(class_="font-large").text),
        "rating": s.find(class_="rating-box").text.strip(),
        "comments": [],
    }

    match = re.search(
        "reminder|auction|YCH Open|YCH Closed|Adopt|pre-order", title, re.IGNORECASE
    )
    if match is not None and title == match.string:
        print(f"post {title} was filtered and will not be downloaded")
        return True

    # Extract tags
    try:
        for tag in s.find(class_='tags-row').findAll(class_='tags'):
            data['tags'].append(tag.find('a').text)
    except:
        pass
        for tag in s.find(class_="tags-row").findAll(class_="tags"):
            data["tags"].append(tag.find("a").text)
    except AttributeError:
        print(f'post: "{title}", has no tags')

    # Extract comments
    for comment in s.findAll(class_='comment_container'):
        temp_ele = comment.find(class_='comment-parent')
        parent_cid = None if temp_ele is None else int(temp_ele.attrs.get('href')[5:])
    image_url = f"https:{image}"
    os.makedirs(output, exist_ok=True)
    output_path = f"{output}/{filename}"
    if args.rating is True:
        os.makedirs(f'{output}/{data.get("rating")}', exist_ok=True)
        output_path = f'{output}/{data.get("rating")}/{filename}'
    if args.folder is not None:
        os.makedirs(f"{output}/{args.folder}", exist_ok=True)
        output_path = f"{output}/{args.folder}/{filename}"
        if args.rating is True:
            os.makedirs(f'{output}/{args.folder}/{data.get("rating")}', exist_ok=True)
            output_path = f'{output}/{args.folder}/{data.get("rating")}/{filename}'

        # Comment is deleted or hidden
        if comment.find(class_='comment-link') is None:
            continue

        data['comments'].append({
            'cid': int(comment.find(class_='comment-link').attrs.get('href')[5:]),
            'parent_cid': parent_cid,
            'content': comment.find(class_='comment_text').contents[0].strip(),
            'username': comment.find(class_='comment_username').text,
            'date': comment.find(class_='popup_date').attrs.get('title')
        })

    url = 'https:{}'.format(image)
    output_path = os.path.join(args.output, filename)

    if not args.dont_redownload or not os.path.isfile(output_path):
        if not download_file(url, output_path, data["title"]):
            return False
    if args.dont_redownload is True and os.path.isfile(output_path):
        print(f'Skipping "{title}", since it\'s already downloaded')
    else:
        print('Skipping "{}", since it\'s already downloaded'.format(data["title"]))
        return True
        download_file(image_url, output_path, title)

    # Write a UTF-8 encoded JSON file for metadata
    with open(os.path.join(args.metadir, '{}.json'.format(filename)), 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=4)
    if args.metadata is True:
        # Extract comments
        for comment in s.findAll(class_="comment_container"):
            temp_ele = comment.find(class_="comment-parent")
            parent_cid = (
                None if temp_ele is None else int(temp_ele.attrs.get("href")[5:])
            )
            # Comment is deleted or hidden
            if comment.find(class_="comment-link") is None:
                continue

            data["comments"].append(
                {
                    "cid": int(
                        comment.find(class_="comment-link").attrs.get("href")[5:]
                    ),
                    "parent_cid": parent_cid,
                    "content": comment.find(class_="comment_text").contents[0].strip(),
                    "username": comment.find(class_="comment_username").text,
                    "date": comment.find(class_="popup_date").attrs.get("title"),
                }
            )

        # Write a UTF-8 encoded JSON file for metadata
        os.makedirs(metadata, exist_ok=True)
        with open(
            os.path.join(metadata, f"{filename}.json"), "w", encoding="utf-8"
        ) as f:
            json.dump(data, f, ensure_ascii=False, indent=4)

    return True

global i
i = 1

# Main downloading loop
while True:

    if args.stop and args.stop == page_num:
        print(f"Reached page {args.stop}, stopping.")
        break

    page_url = '{}/{}'.format(gallery_url, page_num)
def main():
    page_end = args.stop
    page_num = args.start
    page_url = f"{download_url}/{page_num}"
    response = session.get(page_url)
    s = BeautifulSoup(response.text, 'html.parser')

    # Account status
    if page_num == 1:
        if s.find(class_='loggedin_user_avatar') is not None:
            account_username = s.find(class_='loggedin_user_avatar').attrs.get('alt')
            print('Logged in as', account_username)
        else:
            print('Not logged in, NSFW content is inaccessible')

    # System messages
    if s.find(class_='notice-message') is not None:
        message = s.find(class_='notice-message').find('div').find(class_="link-override").text.strip()
        raise Exception('System Message', message)

    # End of gallery
    if s.find(id='no-images') is not None:
        print('End of gallery')
        break

    # Download all images on the page
    for img in s.findAll('figure'):
        download(img.find('a').attrs.get('href'))
        sleep(args.interval)

    if args.category == "msg":
        next_button = s.find('a', class_='button standard more', text="Next 48")

        if next_button is None or next_button.parent is None:
            next_button = s.find('a', class_='button standard more-half', text="Next 48")
            if next_button is None or next_button.parent is None:
                print('Unable to find next button')
                break

        next_page_link = next_button.attrs['href']

        i = i + 1
        page_num = next_page_link.split('/')[-2]
        page_url = base_url + next_page_link

        print('Downloading page', i, page_url)
    elif args.category != "favorites":
        next_button = s.find('button', class_='button standard', text="Next")
        if next_button is None or next_button.parent is None:
            print('Unable to find next button')
            break

        page_num = next_button.parent.attrs['action'].split('/')[-2]

        print('Downloading page', page_num, page_url)
    s = BeautifulSoup(response.text, "html.parser")
    if s.find(class_="loggedin_user_avatar") is not None:
        account_username = s.find(class_="loggedin_user_avatar").attrs.get("alt")
        print("Logged in as", account_username)
    else:
        next_button = s.find('a', class_='button mobile-button right', text="Next")
        if next_button is None:
            print('Unable to find next button')
        print("Not logged in, NSFW content is inaccessible")

    while True:
        if page_end == page_num:
            print(f"Reached page {page_end}, stopping.")
            break

        # unlike galleries that are sequentially numbered, favorites use a different scheme.
        # the "page_num" is instead: [set of numbers]/next (the trailing /next is required)

        next_page_link = next_button.attrs['href']
        next_fav_num = re.search(r'\d+', next_page_link)
        page_url = f"{download_url}/{page_num}"
        response = session.get(page_url)
        s = BeautifulSoup(response.text, "html.parser")

        if next_fav_num == None:
            print('Failed to parse next favorite link.')
        # System messages
        if s.find(class_="notice-message") is not None:
            try:
                message = (
                    s.find(class_="notice-message")
                    .find("div")
                    .find(class_="link-override")
                    .text.strip()
                )
            except AttributeError:
                print("You didn't provide cookies to log in")
                break
            raise Exception("System Message", message)

        # End of gallery
        if s.find(id="no-images") is not None:
            print("End of gallery")
            break

        page_num = next_fav_num.group(0) + "/next"
        # Download all images on the page
        for img in s.findAll("figure"):
            download(img.find("a").attrs.get("href"))
            sleep(args.interval)

        # parse it into numbers/next
        if args.submissions is True:
            next_button = s.find("a", class_="button standard more", text="Next 48")
            if next_button is None or next_button.parent is None:
                next_button = s.find(
                    "a", class_="button standard more-half", text="Next 48"
                )
                if next_button is None or next_button.parent is None:
                    print("Unable to find next button")
                    break

            next_page_link = next_button.attrs["href"]
            page_num = next_page_link.split("/")[-2]
            page_url = BASE_URL + next_page_link

            print("Downloading page", page_num, page_url)
        elif args.category != "favorites":
            next_button = s.find("button", class_="button standard", text="Next")
            if next_button is None or next_button.parent is None:
                print("Unable to find next button")
                break

            page_num = next_button.parent.attrs["action"].split("/")[-2]

            print("Downloading page", page_num, page_url)
        else:
            next_button = s.find("a", class_="button mobile-button right", text="Next")
            if next_button is None:
                print("Unable to find next button")
                break

            # unlike galleries that are sequentially numbered, favorites use a different scheme.
            # the "page_num" is instead: [set of numbers]/next (the trailing /next is required)

            next_page_link = next_button.attrs["href"]
            next_fav_num = re.search(r"\d+", next_page_link)

            if next_fav_num is None:
                print("Failed to parse next favorite link.")
                break

            page_num = next_fav_num.group(0) + "/next"

            # parse it into numbers/next

            print("Downloading page", page_num, page_url)

    print("Finished downloading")


    print('Downloading page', page_num, page_url)


print('Finished downloading')


if __name__ == "__main__":
    main()
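Note on the favorites branch above: it never increments a page counter. It pulls the next set id out of the Next button's href and re-appends the trailing /next, as the in-code comment describes. A minimal sketch of that step under an assumed href shape (the link below is made up):

import re

next_page_link = "/favorites/mylafox/1234567890/next/"  # hypothetical Next-button href
next_fav_num = re.search(r"\d+", next_page_link)         # first run of digits = favorites set id
if next_fav_num is not None:
    page_num = next_fav_num.group(0) + "/next"           # e.g. "1234567890/next"
    print(f"next favorites page: {page_num}")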