scraper.py

#!/usr/bin/env python
from multiprocessing import Process, Queue
from HTMLParser import HTMLParser
from urlparse import urlparse
from urllib2 import urlopen
from os import getpid


class LinkParser(HTMLParser):
    """Collects the links found while parsing a single page."""

    def __init__(self, site, *args, **kwargs):
        HTMLParser.__init__(self)
        self.site = site
        self.links = []

    def reset(self):
        # Called by HTMLParser.__init__ and again by the worker between
        # pages; clears both the parser state and the collected links.
        HTMLParser.reset(self)
        self.site = ''
        self.links = []

    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)
        if tag.lower() == 'a':
            if 'href' in attrs:
                href = attrs['href']
                if href.lower().startswith('http:'):
                    # Absolute link: keep it as-is.
                    self.links.append(href)
                elif href.startswith('/') and '..' not in href:
                    # Root-relative link: resolve it against the current site.
                    self.links.append('http://' + self.site + href)


def parser(q):
    """Worker process: pull (url, depth) pairs off the shared queue and crawl them."""
    parser = LinkParser('')
    stillgoing = True
    while stillgoing:
        (url, depth) = (None, None)
        try:
            (url, depth) = q.get(True, 5)  # 5 seconds should be plenty
        except:
            # Nothing arrived before the timeout (Queue.Empty); assume the
            # crawl is finished and let this worker exit.
            stillgoing = False
            continue
        if depth < 2:
            # Only fetch pages at depth 0 and 1 from the seed URL.
            print "[%s] getting %s" % (getpid(), url)
            parser.site = urlparse(url).netloc
            parser.feed(urlopen(url).read())
            for link in parser.links:
                q.put((link, depth + 1))
            parser.reset()
        else:
            print "[%s] won't get %s -- too deep" % (getpid(), url)


if __name__ == '__main__':
    # Start ten workers sharing one queue, then seed it with the start URL
    # at depth 0.
    q = Queue()
    workers = [Process(target=parser, args=(q,), name="worker %s" % x)
               for x in range(10)]
    for worker in workers:
        worker.start()
    q.put(("http://reddit.com/", 0))
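
Note that scraper.py as written targets Python 2: HTMLParser, urlparse, and urllib2 no longer exist under those names in Python 3, and print is a statement. Purely as a point of reference, below is a minimal single-process sketch of the same link extraction against the Python 3 standard library (html.parser, urllib.parse, urllib.request); the class name LinkCollector and the single-URL main block are illustrative simplifications, not part of the original script.

# python3_links.py -- a minimal sketch, not part of scraper.py
from html.parser import HTMLParser
from urllib.parse import urlparse
from urllib.request import urlopen


class LinkCollector(HTMLParser):
    # Mirrors LinkParser above; renamed here only to avoid confusion.
    def __init__(self, site):
        super().__init__()
        self.site = site
        self.links = []

    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)
        if tag.lower() == 'a' and 'href' in attrs:
            href = attrs['href']
            if href.lower().startswith('http:'):
                self.links.append(href)
            elif href.startswith('/') and '..' not in href:
                self.links.append('http://' + self.site + href)


if __name__ == '__main__':
    url = "http://reddit.com/"  # same seed URL as above, for illustration
    collector = LinkCollector(urlparse(url).netloc)
    # In Python 3, feed() expects str, so the response bytes must be decoded.
    collector.feed(urlopen(url).read().decode('utf-8', 'replace'))
    for link in collector.links:
        print(link)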