scraper.py

#!/usr/bin/env python
from multiprocessing import Process, Queue
from HTMLParser import HTMLParser
from urlparse import urlparse
from urllib2 import urlopen
from os import getpid


class LinkParser(HTMLParser):
    """Collects the links found while parsing a single page."""

    def __init__(self, site, *args, **kwargs):
        HTMLParser.__init__(self)
        self.site = site
        self.links = []

    def reset(self):
        # Called by HTMLParser.__init__ and again by the worker between
        # pages; clears both the parser state and the collected links.
        HTMLParser.reset(self)
        self.site = ''
        self.links = []

    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)
        if tag.lower() == 'a':
            if 'href' in attrs:
                href = attrs['href']
                if href.lower().startswith('http:'):
                    # Absolute link: keep it as-is.
                    self.links.append(href)
                elif href.startswith('/') and '..' not in href:
                    # Root-relative link: resolve it against the current site.
                    self.links.append('http://' + self.site + href)


def parser(q):
    """Worker process: pull (url, depth) pairs off the shared queue and crawl them."""
    parser = LinkParser('')
    stillgoing = True
    while stillgoing:
        (url, depth) = (None, None)
        try:
            (url, depth) = q.get(True, 5)  # 5 seconds should be plenty
        except:
            # Nothing arrived before the timeout (Queue.Empty); assume the
            # crawl is finished and let this worker exit.
            stillgoing = False
            continue
        if depth < 2:
            # Only fetch pages at depth 0 and 1 from the seed URL.
            print "[%s] getting %s" % (getpid(), url)
            parser.site = urlparse(url).netloc
            parser.feed(urlopen(url).read())
            for link in parser.links:
                q.put((link, depth + 1))
            parser.reset()
        else:
            print "[%s] won't get %s -- too deep" % (getpid(), url)


if __name__ == '__main__':
    # Start ten workers sharing one queue, then seed it with the start URL
    # at depth 0.
    q = Queue()
    workers = [Process(target=parser, args=(q,), name="worker %s" % x)
               for x in range(10)]
    for worker in workers:
        worker.start()
    q.put(("http://reddit.com/", 0))
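
Note that scraper.py as written targets Python 2: HTMLParser, urlparse, and urllib2 no longer exist under those names in Python 3, and print is a statement. Purely as a point of reference, below is a minimal single-process sketch of the same link extraction against the Python 3 standard library (html.parser, urllib.parse, urllib.request); the class name LinkCollector and the single-URL main block are illustrative simplifications, not part of the original script.

# python3_links.py -- a minimal sketch, not part of scraper.py
from html.parser import HTMLParser
from urllib.parse import urlparse
from urllib.request import urlopen


class LinkCollector(HTMLParser):
    # Mirrors LinkParser above; renamed here only to avoid confusion.
    def __init__(self, site):
        super().__init__()
        self.site = site
        self.links = []

    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)
        if tag.lower() == 'a' and 'href' in attrs:
            href = attrs['href']
            if href.lower().startswith('http:'):
                self.links.append(href)
            elif href.startswith('/') and '..' not in href:
                self.links.append('http://' + self.site + href)


if __name__ == '__main__':
    url = "http://reddit.com/"  # same seed URL as above, for illustration
    collector = LinkCollector(urlparse(url).netloc)
    # In Python 3, feed() expects str, so the response bytes must be decoded.
    collector.feed(urlopen(url).read().decode('utf-8', 'replace'))
    for link in collector.links:
        print(link)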