#!/usr/bin/env python3
"""Scrape feat data from d20pfsrd.com feat-list pages.

Walks the feat index pages, follows each individual feat link
(skipping teamwork feats), and parses name / prerequisites / benefit /
special / combat-trick text out of each page.
"""

from lxml import html
import requests
import re
import time
import pprint
import pickle


def _is_teamwork(url):
    """True for teamwork-feat URLs, which this scraper deliberately skips."""
    return url.endswith('teamwork') or url.endswith('teamwork/')


def _collect_feat_urls(anchors, baseurl):
    """Return absolute feat URLs for a list of <a> elements.

    Absolute links are kept only when they mention 'feats'; relative
    links are resolved against *baseurl*.  Teamwork links are dropped.
    """
    urls = []
    for anchor in anchors:
        href = anchor.attrib['href']
        if _is_teamwork(href):
            continue
        # BUG FIX: the original tested startswith('http:') only, so
        # 'https:' links fell into the relative branch and were mangled.
        if href.startswith(('http:', 'https:')):
            if 'feats' in href:
                urls.append(href)
        else:
            urls.append(baseurl + '/' + href)
    return urls


def scrape_featlist(url):
    """Fetch a feat index page and return the list of feat-page URLs.

    Handles both page layouts seen on d20pfsrd: a table of feats, and a
    'child pages' bullet list.
    """
    r = requests.get(url)
    featspage = html.fromstring(r.content)
    urls = []
    # for pages with tables
    urls += _collect_feat_urls(
        featspage.xpath('//table/tbody/tr/td[1]/a'), url)
    # for pages with subpages list
    urls += _collect_feat_urls(
        featspage.xpath("//ul[@class='ogn-childpages']/li/a"), url)
    return urls


def sanitext(text):
    """Normalize a scraped text fragment.

    Strips leading ':' and trailing '.', repairs mojibake (UTF-8 bytes
    decoded as Latin-1), converts quotes/dashes for LaTeX, and removes
    source-book abbreviations.  Returns the whitespace-stripped result.
    """
    rettext = re.sub(r'^:\s+', '', text)
    rettext = re.sub(r'\.$', '', rettext)
    rettext = rettext.replace('\r', '')
    # mojibake for en-dash and right single quote (U+2013 / U+2019)
    rettext = rettext.replace('â\x80\x93', '-')
    rettext = rettext.replace('â\x80\x99', "'")
    # BUG FIX: the original re.sub replacement '\textemdash' contained a
    # literal TAB (the '\t' escape), emitting TAB+'extemdash'.  Use
    # str.replace, which performs no escape processing, to emit the
    # intended LaTeX command \textemdash.
    rettext = rettext.replace('—', '\\textemdash')
    rettext = rettext.replace('"', "''")
    # strip source-book abbreviations (Advanced Class Guide, etc.)
    for tag in ('ACG', 'APG', 'ARG', 'ISWG', 'OA', 'UC', 'UI'):
        rettext = rettext.replace(tag, '')
    rettext = rettext.replace(
        "(see the Pathfinder RPG Advanced Player's Guide)", '')
    return rettext.strip()


def addreq(reqset, t, req):
    """Append *req* under key *t* in the *reqset* dict and return it.

    A key's value is always kept as a list; a pre-existing scalar value
    is promoted to a one-element list before appending.
    """
    if t in reqset:
        if not isinstance(reqset[t], list):
            reqset[t] = [reqset[t]]
        reqset[t].append(req)
    else:
        reqset[t] = [req]
    return reqset


def _match_requirement(req):
    """Classify one prerequisite phrase; return a (type, value) pair.

    Recognizes ability scores ('str 13', 'dexterity 15'), base attack
    bonus, class/character levels, and skill ranks; anything else is
    treated as a feat name.
    """
    # short ability-score form, e.g. 'str 13' or 'dex 15.'
    m = re.search(r'^(\w{3}) (\d{1,2})\.?$', req)
    if m:
        return m.group(1), m.group(2)
    m = re.search(r'^dexterity (\d+)', req)
    if m:
        return 'dex', m.group(1)
    m = re.search(r'^strength (\d+)', req)
    if m:
        return 'str', m.group(1)
    m = re.search(r'^base\W+attack\W+bonus\W+\+{0,1}(\d+)', req)
    if m:
        return 'bab', m.group(1)
    # 'fighter level 4' -> (class, level)
    m = re.search(r'^(.*?)\Wlevel (\d+)', req)
    if m:
        return 'level', (m.group(1), m.group(2))
    # '4th-level fighter' -> (class, level)
    m = re.search(r'^(\d+).*level (.*)$', req)
    if m:
        return 'level', (m.group(2), m.group(1))
    m = re.search(r'^(.*?) (\d+) rank', req)
    if m:
        return 'skill', (m.group(1), m.group(2))
    return 'feat', req


def parse_prereqs(reqtext, sep=r'[,;]'):
    """Parse a prerequisites sentence into a {type: [values]} dict.

    Splits on *sep*, then recurses on ' or ' / ' and ' conjunctions so
    that alternatives end up nested under 'or'/'and' keys.
    """
    reqs = {}
    for req in [sanitext(x).lower() for x in re.split(sep, reqtext)]:
        if ' or ' in req:
            reqs = addreq(reqs, 'or', parse_prereqs(req, sep=r'\Wor\W'))
        elif ' and ' in req:
            reqs = addreq(reqs, 'and', parse_prereqs(req, sep=r'\Wand\W'))
        else:
            key, value = _match_requirement(req)
            reqs = addreq(reqs, key, value)
    return reqs


def scrape_feat(url):
    """Scrape one feat page into a dict, or return None.

    Returns None for teamwork feats and for pages where no Benefit text
    could be extracted.
    """
    r = requests.get(url)
    featpage = html.fromstring(r.content)
    feat = {'prereqs': {},
            'benefit': '',
            'special': '',
            'trick': '',
            }
    feattexts = [x for x in featpage.xpath(
        '//span[@id="ctl00_MainContent_DataListTypes_ctl00_LabelName"]'
    )[0].itertext()]
    # stop one short: most branches read the text node after the label
    for i in range(0, len(feattexts) - 1):
        t = feattexts[i]
        if i == 0:
            if t.find('Teamwork') < 0:
                feat['name'] = sanitext(re.sub(r'\(.*?\)', '', t))
            else:
                break
        elif t == 'Benefit' and feat['benefit'] == '':
            feat['benefit'] = sanitext(feattexts[i + 1])
        elif t == 'Prerequisites' and feat['prereqs'] == {}:
            # BUG FIX: the original guarded this branch on
            # feat['benefit'] == '' (copy/paste from the Benefit branch);
            # guard on the prereqs field itself, as the siblings do.
            feat['prereqs'] = parse_prereqs(feattexts[i + 1])
        elif t == 'Special' and feat['special'] == '':
            feat['special'] = sanitext(feattexts[i + 1])
        elif t == 'Combat Trick' and i + 7 < len(feattexts):
            # bounds guard added: the original indexed i+7 unchecked and
            # could raise IndexError near the end of the text list
            feat['trick'] = sanitext(feattexts[i + 7])
    if feat['benefit']:
        return feat
    return None


def scrape_feats(baseurls):
    """Scrape every feat linked from the index pages in *baseurls*.

    NOTE(review): result collection and pickling are currently commented
    out, so this fetches and prints each feat URL but discards the
    parsed feats — presumably a work-in-progress / debugging state.
    """
    urls = []
    for baseurl in baseurls:
        urls += scrape_featlist(baseurl)

    feats = []
    for url in urls:
        print(url)
        feat = scrape_feat(url)
        # if feat:
        #     feats.append(feat)
        # time.sleep(.3)

    # with open('feats.pickle', 'wb') as f:
    #     pickle.dump(feats, f)


if __name__ == '__main__':
    scrape_feats(['http://www.d20pfsrd.com/feats/combat-feats',
                  'http://www.d20pfsrd.com/feats/armor-mastery-feats',
                  'http://www.d20pfsrd.com/feats/weapon-mastery-feats'])


# /html/body/div/form/div[3]/div[2]/div[2]/div/table/tbody/tr[2]/td[1]
# #ctl00_MainContent_GridView6 > tbody:nth-child(1) > tr:nth-child(2) > td:nth-child(1)
# html body.dark div#wrapper.clearfix form#aspnetForm div#page.page.clearfix div#main-wrapper.main-wrapper div#main.main div table#ctl00_MainContent_GridView6 tbody tr td
# /html/body/div[2]/div
# body > div:nth-child(2) > div:nth-child(1)