# -*- coding: utf-8 -*- import urllib import urlparse import os import re import sys class url_finder: def __init__(self, url): html = self.get_html(sys.argv[1]) # url trimming url_parsed = urlparse.urlparse(url) self.url = '%s://%s' % (url_parsed[0], url_parsed[1]) # exclude file name spl = url_parsed[2].split('/') if len(spl) > 1: self.url += '/'.join(spl[0:-1]) # links, and images self.links = self.get_all_external_links(html) self.save_all_images(html) def get_all_external_links(self, html): found = re.findall(r'