#!/usr/bin/env python
"""Toy multi-process web crawler: ten workers share a Queue of (url, depth) pairs."""

from multiprocessing import Process, Queue
from Queue import Empty          # multiprocessing.Queue.get() raises Queue.Empty on timeout
from HTMLParser import HTMLParser
from urlparse import urlparse
from urllib2 import urlopen
from os import getpid


class LinkParser(HTMLParser):
    """Collect absolute and site-relative links from the <a href="..."> tags of a page."""

    def __init__(self, site):
        HTMLParser.__init__(self)
        self.site = site
        self.links = []

    def reset(self):
        # Called by HTMLParser.__init__() and between pages to discard old state.
        HTMLParser.reset(self)
        self.site = ''
        self.links = []

    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)
        if tag.lower() == 'a' and 'href' in attrs:
            href = attrs['href']
            if href.lower().startswith('http:'):
                # Absolute link: keep it as-is.
                self.links.append(href)
            elif href.startswith('/') and '..' not in href:
                # Site-relative link: resolve it against the current site.
                self.links.append('http://' + self.site + href)


def parser(q):
    """Worker: pull (url, depth) pairs off the queue, fetch each page, enqueue its links."""
    link_parser = LinkParser('')
    stillgoing = True
    while stillgoing:
        try:
            (url, depth) = q.get(True, 5)  # 5 seconds should be plenty
        except Empty:
            # Nothing left to crawl; let the worker exit.
            stillgoing = False
            continue
        if depth < 2:
            print "[%s] getting %s" % (getpid(), url)
            link_parser.site = urlparse(url).netloc
            link_parser.feed(urlopen(url).read())
            for link in link_parser.links:
                q.put((link, depth + 1))
            link_parser.reset()
        else:
            print "[%s] won't get %s -- too deep" % (getpid(), url)


if __name__ == '__main__':
    q = Queue()
    workers = [Process(target=parser, args=(q,), name="worker %s" % x)
               for x in range(10)]
    for worker in workers:
        worker.start()
    # Seed the crawl; the workers fan out from here.
    q.put(("http://reddit.com/", 0))
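    # A possible extension, not part of the original listing: each worker exits
    # on its own once its q.get() has waited five seconds without receiving a
    # URL, but this __main__ block returns as soon as the queue is seeded.
    # Joining the workers makes the script itself wait until the crawl winds down.
    for worker in workers:
        worker.join()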