#!/usr/bin/env python3
"""Scrape Pathfinder combat feats from archivesofnethys.com into feats.pickle."""

from lxml import html
import requests
import re
import time
import pprint
import pickle


# Sourcebook abbreviation tags the site appends to names/prerequisites.
# They are stripped sequentially, in this order (order preserved from the
# original chained re.sub calls, since earlier removals can expose later tags).
_SOURCE_TAGS = ('ACG', 'APG', 'ARG', 'ISWG', 'OA', 'UC', 'UI')


def scrape_featlist(url):
    """Return the per-feat page URLs listed on the feat index page.

    NOTE(review): the live fetch is commented out, so ``url`` is currently
    ignored and a locally saved copy of the index is parsed instead —
    confirm before deploying.
    """
    #r = requests.get(url)
    #featspage = html.fromstring(r.content)
    featspage = html.parse('/home/jmelesky/code/featscraper/assets/feats.html')
    # First cell of each grid row holds the anchor linking to the feat page.
    anchors = featspage.xpath('//table[@id="ctl00_MainContent_GridView6"]/tr/td[1]/a')
    return [a.attrib['href'] for a in anchors]


def sanitext(text):
    """Normalize a scraped text fragment and return it.

    Strips a leading ": ", a trailing period, repairs common mojibake
    (UTF-8 en dash / right single quote read as latin-1), removes
    sourcebook abbreviation tags and one boilerplate parenthetical,
    then trims surrounding whitespace.
    """
    rettext = re.sub(r'^:\s+', '', text)
    rettext = re.sub(r'\.$', '', rettext)
    # UTF-8 byte sequences decoded as latin-1: en dash and right single quote.
    rettext = rettext.replace('â\x80\x93', '-')
    rettext = rettext.replace('â\x80\x99', "'")
    for tag in _SOURCE_TAGS:
        rettext = rettext.replace(tag, '')
    rettext = rettext.replace("(see the Pathfinder RPG Advanced Player's Guide)", '')
    return rettext.strip()


def addreq(reqset, t, req):
    """Append ``req`` under key ``t`` in ``reqset`` and return ``reqset``.

    An existing non-list value is promoted to a single-element list before
    appending; a missing key is created as ``[req]``.
    """
    if t in reqset:
        if not isinstance(reqset[t], list):
            reqset[t] = [reqset[t]]
        reqset[t].append(req)
    else:
        reqset[t] = [req]
    return reqset


def parse_prereqs(reqtext, sep=r'[,;]'):
    """Parse a prerequisites string into a dict of requirement lists.

    The text is split on ``sep``; fragments containing " or "/" and " recurse
    with a word-boundary separator and are stored under 'or'/'and'.  Leaf
    fragments are classified, in order, as: ability score (e.g. "str 13"),
    base attack bonus ('bab'), class/character 'level', skill rank ('skill'),
    or — as the fallback — a 'feat' name.
    """
    reqs = {}
    for req in [sanitext(x).lower() for x in re.split(sep, reqtext)]:
        if req.find(' or ') > -1:
            reqs = addreq(reqs, 'or', parse_prereqs(req, sep=r'\Wor\W'))
        elif req.find(' and ') > -1:
            reqs = addreq(reqs, 'and', parse_prereqs(req, sep=r'\Wand\W'))
        else:
            # Ability score: three-letter abbreviation + two-digit value.
            m = re.search(r'^(\w{3}) (\d{2})$', req)
            if m:
                reqs = addreq(reqs, m.group(1), m.group(2))
                continue
            m = re.search(r'^base\s+attack\s+bonus\s+\+{0,1}(\d+)$', req)
            if m:
                reqs = addreq(reqs, 'bab', m.group(1))
                continue
            # "<class> level N" → ('<class>', 'N')
            m = re.search(r'^(.*?)\Wlevel (\d+)', req)
            if m:
                reqs = addreq(reqs, 'level', (m.group(1), m.group(2)))
                continue
            # "Nth level <class>" → ('<class>', 'N')
            m = re.search(r'^(\d+).*level (.*)$', req)
            if m:
                reqs = addreq(reqs, 'level', (m.group(2), m.group(1)))
                continue
            # "<skill> N rank(s)" → ('<skill>', 'N')
            m = re.search(r'^(.*?) (\d+) rank', req)
            if m:
                reqs = addreq(reqs, 'skill', (m.group(1), m.group(2)))
            else:
                reqs = addreq(reqs, 'feat', req)
    return reqs


def scrape_feat(url):
    """Fetch one feat page and return a feat dict, or None if no benefit found.

    Teamwork feats are skipped (returns None via the empty-benefit check).
    """
    r = requests.get(url)
    featpage = html.fromstring(r.content)
    #featpage = html.parse('/home/jmelesky/code/featscraper/assets/feat-ald.html')
    feat = {'prereqs': {},
            'benefit': '',
            'special': '',
            'trick': '',
            }
    feattexts = [x for x in featpage.xpath('//span[@id="ctl00_MainContent_DataListTypes_ctl00_LabelName"]')[0].itertext()]
    # Stop one short of the end: most branches read feattexts[i+1].
    for i in range(0, len(feattexts) - 1):
        t = feattexts[i]
        if i == 0:
            if t.find('Teamwork') < 0:
                # Drop any parenthetical qualifier from the feat name.
                feat['name'] = sanitext(re.sub(r'\(.*?\)', '', t))
            else:
                break
        elif t == 'Benefit' and feat['benefit'] == '':
            feat['benefit'] = sanitext(feattexts[i + 1])
        # Fixed: guard was `feat['benefit'] == ''` — a copy-paste from the
        # Benefit branch; first-match-wins must key on prereqs itself.
        elif t == 'Prerequisites' and feat['prereqs'] == {}:
            feat['prereqs'] = parse_prereqs(feattexts[i + 1])
        elif t == 'Special' and feat['special'] == '':
            feat['special'] = sanitext(feattexts[i + 1])
        # The trick text sits 7 text nodes after the heading; bounds-check so
        # a short page cannot raise IndexError.
        elif t == 'Combat Trick' and i + 7 < len(feattexts):
            feat['trick'] = sanitext(feattexts[i + 7])
    if feat['benefit']:
        return feat
    return None


def scrape_feats(base, url):
    """Scrape every feat linked from the index page and pickle the results.

    Writes the list of feat dicts to 'feats.pickle' in the working directory.
    A 0.3s pause between requests keeps the scrape polite.
    """
    featurls = scrape_featlist(base + url)
    feats = []
    for featurl in featurls:
        print(featurl)
        feat = scrape_feat(base + featurl)
        if feat:
            feats.append(feat)
        time.sleep(.3)
    with open('feats.pickle', 'wb') as f:
        pickle.dump(feats, f)


if __name__ == '__main__':
    scrape_feats('http://www.archivesofnethys.com/', 'Feats.aspx?Category=Combat')


# /html/body/div/form/div[3]/div[2]/div[2]/div/table/tbody/tr[2]/td[1]
# #ctl00_MainContent_GridView6 > tbody:nth-child(1) > tr:nth-child(2) > td:nth-child(1)
# html body.dark div#wrapper.clearfix form#aspnetForm div#page.page.clearfix div#main-wrapper.main-wrapper div#main.main div table#ctl00_MainContent_GridView6 tbody tr td
# /html/body/div[2]/div
# body > div:nth-child(2) > div:nth-child(1)