Save unfinished work.

Add summary function.
Explicitly declare parser.
2020-07-29 22:21:57 +02:00 · 2020-07-29 22:21:57 +02:00 · 2020-07-29 22:21:57 +02:00 · 2020-07-29 22:21:57 +02:00 · 2020-07-29 22:21:57 +02:00 · 2020-07-29 22:21:57 +02:00
1 changed files with 67 additions and 0 deletions
--- a/main.py
+++ b/main.py
@@ -0,0 +1,67 @@
+import argparse
+import os.path
+import re
+import sys
+from collections import Counter
+from urllib.parse import urljoin
+
+import httplib2
+from bs4 import BeautifulSoup
+
+if __name__ == '__main__':
+    # Command-line entry point: download one page, collect every tag whose
+    # href matches the filter, then either print a per-extension summary
+    # (--summary) or print the resolved links (optionally saving them too).
+    parser = argparse.ArgumentParser(description="Dig out links from a website.")
+    parser.add_argument('site', type=str, help="Website that you want to scrape for links.")
+    parser.add_argument('-f', '--filter', type=str, default=r'..*$',
+                        help="Only return file names matching this regular expression.")
+    parser.add_argument('-s', '--summary', action='store_true', help="Print summary and quit.")
+    parser.add_argument('-d', '--debug', action='store_true', help="Print out debugging information.")
+    parser.add_argument('-r', '--save-report', action='store_true',
+                        help='Save report of all the original links to a file.')
+
+    args = parser.parse_args()
+
+    # --save-report is a boolean flag, so it cannot double as the file name:
+    # open(True, 'w') would open file descriptor 1 (stdout) instead of a file.
+    report_path = 'report.txt'
+
+    # todo: save in tmp
+    h = httplib2.Http('.cache')
+    # Only the body is needed; the response headers are ignored.
+    _, content = h.request(args.site)
+
+    s = BeautifulSoup(content, features='html.parser')
+
+    # Every tag (not just <a>) whose href attribute matches the filter regex.
+    links = s.find_all(href=re.compile(args.filter))
+    if not links:
+        # A string argument is printed to stderr and the exit status is 1.
+        sys.exit("No file names found with the given filter.")
+    if args.summary:
+        # grab only the extension from splitext
+        extensions = []
+        for link in links:
+            filename, extension = os.path.splitext(link['href'])
+            if extension:
+                extensions.append(extension)
+            if args.debug:
+                print(f'Original: {link}, splitext[0]: {filename}, splitext[1]: {extension}')
+        print(f'URL: {args.site}')
+        print(f'Total file names: {len(links)}')
+        print('File types:')
+        # todo: in the counts of extensions mention varying capitalisation
+        extension_count = Counter(extensions)
+        # Report .jpeg together with .jpg, remembering how many were spelled
+        # .jpeg so the combined line can mention it.
+        jpeg_count = extension_count.pop('.jpeg', 0)
+        if jpeg_count:
+            extension_count['.jpg'] += jpeg_count
+        for extension, count in extension_count.items():
+            # do not print the first character (dot) of extension
+            if extension == '.jpg' and jpeg_count:
+                print(f'{count}\tjpg ({jpeg_count} as jpeg)')
+            else:
+                print(f'{count}\t{extension[1:]}')
+        sys.exit(0)
+    # we got to the end, print out all the links
+    # urljoin resolves relative, root-relative and absolute hrefs against the
+    # page URL; plain concatenation only worked for one of those cases.
+    link_list = [urljoin(args.site, link['href']) for link in links]
+    for link in link_list:
+        print(link)
+    if args.save_report:
+        with open(report_path, 'w', encoding='utf-8') as file:
+            # todo: create summary object that will be printed with --summary or saved to file with --save-summary
+            print(link_list, file=file)
Author	SHA1	Message	Date
Lukáš Kucharczyk	1ffe83e493	Save unfinished work.	2020-07-29 22:21:57 +02:00
Lukáš Kucharczyk	914498121c	Add summary function.	2020-07-29 22:21:57 +02:00
Lukáš Kucharczyk	b3a74584e9	Explicitly declare parser.	2020-07-29 22:21:57 +02:00
Lukáš Kucharczyk	9864ad2617	Add the filter argument.	2020-07-29 22:21:57 +02:00
Lukáš Kucharczyk	2ab1331c92	Add support for specifying website.	2020-07-29 22:21:57 +02:00
Lukáš Kucharczyk	08e4cb4630	Initial version.	2020-07-29 22:21:57 +02:00