#!/usr/bin/env python
# Depth-limited link crawler: a shared Queue feeds a pool of worker
# processes, each running its own HTMLParser subclass. (Python 2.)

from multiprocessing import Process, Queue
from Queue import Empty
from HTMLParser import HTMLParser
from urlparse import urlparse
from urllib2 import urlopen
from os import getpid


class LinkParser(HTMLParser):
    """Collects the href targets of <a> tags into self.links."""

    def __init__(self, site, *args, **kwargs):
        HTMLParser.__init__(self)
        self.site = site
        self.links = []

    def reset(self):
        HTMLParser.reset(self)
        self.site = ''
        self.links = []

    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)
        if tag.lower() == 'a' and 'href' in attrs:
            href = attrs['href']
            if href.lower().startswith('http:'):
                # Absolute link: keep it as-is.
                self.links.append(href)
            elif href.startswith('/') and '..' not in href:
                # Site-relative link: rebuild it against the current host.
                self.links.append('http://' + self.site + href)


def parser(q):
    parser = LinkParser('')
    stillgoing = True
    while stillgoing:
        try:
            (url, depth) = q.get(True, 5)  # 5 seconds should be plenty
        except Empty:
            # The queue has been quiet for 5 seconds -- assume the crawl is done.
            stillgoing = False
            continue
        if depth < 2:
            print "[%s] getting %s" % (getpid(), url)
            parser.site = urlparse(url).netloc
            parser.feed(urlopen(url).read())
            for link in parser.links:
                q.put((link, depth + 1))
            parser.reset()
        else:
            print "[%s] won't get %s -- too deep" % (getpid(), url)


if __name__ == '__main__':
    q = Queue()
    workers = [Process(target=parser, args=(q,), name="worker %s" % x)
               for x in range(10)]
    for worker in workers:
        worker.start()
    q.put(("http://reddit.com/", 0))
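
The main process does nothing after seeding the queue; because the workers are ordinary (non-daemon) Process objects, the interpreter waits for them to finish, and each worker quits on its own once its queue read has timed out. If you want that shutdown to be explicit, a minimal sketch (the join calls are an addition, not part of the original listing) is to append this to the __main__ block:

    # Optional explicit shutdown -- not in the original listing.
    # join() returns once a worker's q.get() has timed out and the
    # worker has fallen out of its while loop.
    for worker in workers:
        worker.join()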