shovel/main.py

import argparse
import os.path
import re
from collections import Counter
from urllib.parse import urljoin

import httplib2
from bs4 import BeautifulSoup

def extension_summary(hrefs):
    """Return one summary line per file extension found in *hrefs*.

    Each line has the form ``"<count>\\t<extension without dot>"``. Links
    ending in ``.jpeg`` are counted together with ``.jpg``; when at least one
    ``.jpeg`` was seen, the ``jpg`` line is suffixed with
    ``" (<n> as jpeg)"``. Hrefs without an extension are ignored.

    Matching is case-sensitive, so e.g. ``.JPG`` is reported separately.
    """
    counts = Counter()
    for href in hrefs:
        extension = os.path.splitext(href)[1]
        if extension:
            counts[extension] += 1

    # Fold the .jpeg spelling into .jpg but remember how many there were.
    jpeg_count = counts.pop('.jpeg', 0)
    if jpeg_count:
        counts['.jpg'] += jpeg_count

    lines = []
    for extension, count in counts.items():
        # do not print the first character (dot) of extension
        line = f'{count}\t{extension[1:]}'
        if extension == '.jpg' and jpeg_count:
            line += f' ({jpeg_count} as jpeg)'
        lines.append(line)
    return lines


def build_link_list(site, hrefs):
    """Resolve every href against *site* and return the resulting URLs.

    ``urljoin`` is used instead of string concatenation so that absolute
    hrefs are kept as they are and relative ones are resolved against the
    site URL.
    """
    return [urljoin(site, href) for href in hrefs]


def main():
    """Parse the command line, fetch the site and print its links.

    Returns:
        ``0`` on success, or an error message string when no link matches
        the filter. The value is meant to be handed to ``SystemExit``.
    """
    parser = argparse.ArgumentParser(description="Dig out links from a website.")
    parser.add_argument('site', type=str, help="Website that you want to scrape for links.")
    parser.add_argument('-f', '--filter', type=str, default=r'..*$',
                        help="Only return file names matching this regular expression.")
    parser.add_argument('-s', '--summary', action='store_true', help="Print summary and quit.")
    parser.add_argument('-d', '--debug', action='store_true', help="Print out debugging information.")
    parser.add_argument('-r', '--save-report', action='store_true',
                        help='Save report of all the original links to a file.')
    # --save-report stays a plain flag so existing invocations keep working;
    # the destination is configurable through this separate option.
    parser.add_argument('--report-file', type=str, default='shovel_report.txt',
                        help='File written by --save-report (default: %(default)s).')

    args = parser.parse_args()

    # todo: save in tmp
    http = httplib2.Http('.cache')
    _response, content = http.request(args.site)

    soup = BeautifulSoup(content, features='html.parser')

    # Only elements that carry an href matching the filter are returned,
    # so link['href'] is always present below.
    links = soup.find_all(href=re.compile(args.filter))
    if not links:
        return "No file names found with the given filter."

    hrefs = [link['href'] for link in links]

    if args.summary:
        if args.debug:
            for link in links:
                filename, extension = os.path.splitext(link['href'])
                print(f'Original: {link}, splitext[0]: {filename}, splitext[1]: {extension}')
        print(f'URL: {args.site}')
        print(f'Total file names: {len(links)}')
        print('File types:')
        # todo: in the counts of extensions mention varying capitalisation
        for line in extension_summary(hrefs):
            print(line)
        return 0

    # we got to the end, print out all the links
    link_list = build_link_list(args.site, hrefs)
    for link in link_list:
        print(link)
    if args.save_report:
        # todo: create summary object that will be printed with --summary or saved to file with --save-summary
        with open(args.report_file, 'w', encoding='utf-8') as file:
            file.writelines(f'{link}\n' for link in link_list)
    return 0


if __name__ == '__main__':
    # SystemExit with a string prints it to stderr and exits with status 1.
    raise SystemExit(main())
Initial version. 2020-06-28 19:11:24 +00:00			`import httplib2`
			`import re`
Add support for specifying website. 2020-06-28 19:20:52 +00:00			`import argparse`
Add summary function. 2020-06-28 20:28:06 +00:00			`import os.path`
			`from collections import Counter`
Initial version. 2020-06-28 19:11:24 +00:00			`from bs4 import BeautifulSoup`

			`if __name__ == '__main__':`
Add support for specifying website. 2020-06-28 19:20:52 +00:00			`parser = argparse.ArgumentParser(description="Dig out links from a website.")`
			`parser.add_argument('site', type=str, help="Website that you want to scrape for links.")`
Save unfinished work. 2020-07-29 20:21:53 +00:00			`parser.add_argument('-f', '--filter', type=str, default=r'..*$',`
			`help="Only return file names matching this regular expression.")`
Add summary function. 2020-06-28 20:28:06 +00:00			`parser.add_argument('-s', '--summary', action='store_true', help="Print summary and quit.")`
Save unfinished work. 2020-07-29 20:21:53 +00:00			`parser.add_argument('-d', '--debug', action='store_true', help="Print out debugging information.")`
			`parser.add_argument('-r', '--save-report', action='store_true',`
			`help='Save report of all the original links to a file.')`

Add support for specifying website. 2020-06-28 19:20:52 +00:00			`args = parser.parse_args()`

Save unfinished work. 2020-07-29 20:21:53 +00:00			`# todo: save in tmp`
Initial version. 2020-06-28 19:11:24 +00:00			`h = httplib2.Http('.cache')`
Add support for specifying website. 2020-06-28 19:20:52 +00:00			`response, content = h.request(args.site)`
Explicitly declare parser. 2020-06-28 19:46:55 +00:00
Save unfinished work. 2020-07-29 20:21:53 +00:00			`s = BeautifulSoup(content, features='html.parser')`
Explicitly declare parser. 2020-06-28 19:46:55 +00:00
Save unfinished work. 2020-07-29 20:21:53 +00:00			`# if args.debug:`
			`# print(s.prettify())`

Add the filter argument. 2020-06-28 19:45:14 +00:00			`links = s.find_all(href=re.compile(args.filter))`
			`if not links or links == 0:`
Save unfinished work. 2020-07-29 20:21:53 +00:00			`exit("No file names found with the given filter.")`
Add summary function. 2020-06-28 20:28:06 +00:00			`if args.summary:`
Save unfinished work. 2020-07-29 20:21:53 +00:00			`# grab only the extension from splitext`
			`extensions = []`
			`has_jpeg = False`
			`for link in links:`
			`filename, extension = os.path.splitext(link['href'])`
			`if extension:`
			`if extension == '.jpeg':`
			`has_jpeg = True`
			`extensions.append(extension)`
			`if args.debug:`
			`print(f'Original: {link}, splitext[0]: {filename}, splitext[1]: {extension}')`
			`# todo: merge jpeg and jpg into one extension (display as: xxx jpg (yyy as jpeg))`
Add summary function. 2020-06-28 20:28:06 +00:00			`print(f'URL: {args.site}')`
Save unfinished work. 2020-07-29 20:21:53 +00:00			`print(f'Total file names: {len(links)}')`
Add summary function. 2020-06-28 20:28:06 +00:00			`print('File types:')`
Save unfinished work. 2020-07-29 20:21:53 +00:00			`# todo: in the counts of extensions mention varying capitalisation`
			`extension_count = Counter(extensions)`
			`if has_jpeg:`

			`for extension, count in extension_count.items():`
			`# do not print the first character (dot) of extension`
			`if not has_jpeg:`
			`print(f'{count}\t{extension[1:]}')`
			`else:`
			`if extension == '.jpg' or extension == '.jpeg':`
			`print(f"{count}\t{extension[1:]} (with some as .jpeg")`
Add summary function. 2020-06-28 20:28:06 +00:00			`exit(0)`
Save unfinished work. 2020-07-29 20:21:53 +00:00			`# we got to the end, print out all the links`
			`link_list = [f'{args.site}{link["href"]}' for link in links]`
			`for link in link_list:`
			`print(f"{link}")`
			`if args.save_report:`
			`with open(args.save_report, 'w') as file:`
			`# todo: create summary object that will be printed with --summary or saved to file with --save-summary`
			`print(link_list, file=file)`