#!/usr/bin/env python3
from lxml import html
import requests
import re
import time
import pprint
import pickle
def scrape_featlist(url):
    """Fetch a feat index page and return the URLs of the feat pages it links to.

    Links are collected from two layouts used by d20pfsrd index pages: the
    first column of tables, and the 'ogn-childpages' subpage list. Teamwork
    feats are skipped; relative links are resolved against *url*.
    """
    r = requests.get(url)
    featspage = html.fromstring(r.content)

    def _is_teamwork(href):
        # Teamwork feats are excluded in every branch.
        return href.endswith('teamwork') or href.endswith('teamwork/')

    # Both layouts get the exact same filtering, so gather the anchors first
    # (tables, then subpage lists — same order as scraping each in turn).
    anchors = (featspage.xpath('//table/tbody/tr/td[1]/a')
               + featspage.xpath("//ul[@class='ogn-childpages']/li/a"))

    urls = []
    for featanchor in anchors:
        featurl = featanchor.attrib['href']
        if _is_teamwork(featurl):
            continue
        if featurl.startswith(('http:', 'https:')):
            # Absolute link: keep only actual feat pages.
            # (fixed: the original tested only 'http:', so https links fell
            # through to the relative branch and were mangled)
            if 'feats' in featurl:
                urls.append(featurl)
        else:
            # Relative link: resolve against the index page URL.
            urls.append(url + '/' + featurl)
    return urls
def sanitext(text):
    """Normalize a scraped text fragment.

    Strips a leading ': ' separator and a trailing period, removes carriage
    returns, repairs common UTF-8-decoded-as-Latin-1 mojibake (en dash,
    right single quote, em dash), converts double quotes to LaTeX-style '',
    and drops sourcebook abbreviations (ACG, APG, ...). Returns the result
    with surrounding whitespace stripped.
    """
    rettext = re.sub(r'^:\s+', '', text)
    rettext = re.sub(r'\.$', '', rettext)
    rettext = re.sub(r'\r', '', rettext)
    # Mojibake: UTF-8 punctuation bytes decoded as Latin-1.
    rettext = re.sub(r'â\x80\x93', '-', rettext)    # en dash
    rettext = re.sub(r'â\x80\x99', "'", rettext)    # right single quote
    # Em dash: handle the full three-byte sequence so no '\x80\x94' residue
    # is left behind, then catch any stray 'â' the same way.
    # NOTE: the replacement must escape the backslash — re.sub interprets
    # '\t...' in the template as a literal tab (the original bug).
    rettext = re.sub(r'â\x80\x94', r'\\textemdash', rettext)
    rettext = re.sub(r'â', r'\\textemdash', rettext)
    rettext = re.sub(r'"', "''", rettext)
    # Sourcebook abbreviations that d20pfsrd appends to feat names.
    for abbrev in ('ACG', 'APG', 'ARG', 'ISWG', 'OA', 'UC', 'UI'):
        rettext = rettext.replace(abbrev, '')
    rettext = re.sub(r'\(see the Pathfinder RPG Advanced Player\'s Guide\)', '', rettext)
    return rettext.strip()
def addreq(reqset, t, req):
    """Append *req* under key *t* in *reqset*, promoting a scalar to a list.

    Mutates *reqset* in place and returns it for call-chaining convenience.
    """
    if t not in reqset:
        reqset[t] = [req]
        return reqset
    current = reqset[t]
    if not isinstance(current, list):
        # A scalar slipped in previously; wrap it so we can accumulate.
        current = [current]
        reqset[t] = current
    current.append(req)
    return reqset
def parse_prereqs(reqtext, sep=r'[,;]'):
    """Parse a prerequisites sentence into a dict of requirement lists.

    Splits *reqtext* on the *sep* regex, then classifies each piece:
    abbreviated and spelled-out ability scores, base attack bonus,
    class/character levels, skill ranks, and nested 'or'/'and' clauses
    (parsed recursively). Anything unrecognized is recorded as a feat name.
    The original nine-level if/else pyramid is flattened into guard clauses;
    classification order is unchanged.
    """
    reqs = {}
    for req in [sanitext(x).lower() for x in re.split(sep, reqtext)]:
        # Nested alternatives / conjunctions recurse with a clause-level split.
        if ' or ' in req:
            reqs = addreq(reqs, 'or', parse_prereqs(req, sep=r'\Wor\W'))
            continue
        if ' and ' in req:
            reqs = addreq(reqs, 'and', parse_prereqs(req, sep=r'\Wand\W'))
            continue
        # Abbreviated ability score, e.g. 'dex 13' — keyed by the abbreviation.
        m = re.search(r'^(\w{3}) (\d{1,2})\.?$', req)
        if m:
            reqs = addreq(reqs, m.group(1), m.group(2))
            continue
        # Spelled-out ability scores.
        m = re.search(r'^dexterity (\d+)', req)
        if m:
            reqs = addreq(reqs, 'dex', m.group(1))
            continue
        m = re.search(r'^strength (\d+)', req)
        if m:
            reqs = addreq(reqs, 'str', m.group(1))
            continue
        # 'base attack bonus +N' (the '+' is optional).
        m = re.search(r'^base\W+attack\W+bonus\W+\+{0,1}(\d+)', req)
        if m:
            reqs = addreq(reqs, 'bab', m.group(1))
            continue
        # 'fighter level 4' style: (class, level).
        m = re.search(r'^(.*?)\Wlevel (\d+)', req)
        if m:
            reqs = addreq(reqs, 'level', (m.group(1), m.group(2)))
            continue
        # '4th level fighter' style, normalized to the same (class, level) order.
        m = re.search(r'^(\d+).*level (.*)$', req)
        if m:
            reqs = addreq(reqs, 'level', (m.group(2), m.group(1)))
            continue
        # 'acrobatics 3 ranks' style: (skill, ranks).
        m = re.search(r'^(.*?) (\d+) rank', req)
        if m:
            reqs = addreq(reqs, 'skill', (m.group(1), m.group(2)))
            continue
        # Fallback: treat the whole phrase as a feat name.
        reqs = addreq(reqs, 'feat', req)
    return reqs
def scrape_feat(url):
    """Fetch one feat page and return a dict describing the feat.

    The returned dict has keys 'name', 'prereqs', 'benefit', 'special' and
    'trick'. Returns None when no Benefit text is found (which includes
    Teamwork feats, whose parsing is abandoned immediately).
    """
    r = requests.get(url)
    featpage = html.fromstring(r.content)
    feat = {'prereqs': {},
            'benefit': '',
            'special': '',
            'trick': '',
            }
    feattexts = [x for x in
                 featpage.xpath('//span[@id="ctl00_MainContent_DataListTypes_ctl00_LabelName"]')[0].itertext()]
    for i in range(0, len(feattexts) - 1):
        t = feattexts[i]
        if i == 0:
            # First text node is the feat name; skip Teamwork feats outright.
            if t.find('Teamwork') < 0:
                feat['name'] = sanitext(re.sub(r'\(.*?\)', '', t))
            else:
                break
        elif t == 'Benefit' and feat['benefit'] == '':
            feat['benefit'] = sanitext(feattexts[i+1])
        elif t == 'Prerequisites' and feat['prereqs'] == {}:
            # Fixed: the original guarded this on feat['benefit'] == '' (a
            # copy-paste of the Benefit branch), which only worked because
            # Prerequisites usually precedes Benefit on the page.
            feat['prereqs'] = parse_prereqs(feattexts[i+1])
        elif t == 'Special' and feat['special'] == '':
            feat['special'] = sanitext(feattexts[i+1])
        elif t == 'Combat Trick' and i + 7 < len(feattexts):
            # The trick text sits a fixed 7 nodes after this heading on
            # d20pfsrd pages (assumed layout — TODO confirm); the bounds
            # guard prevents an IndexError near the end of the node list.
            feat['trick'] = sanitext(feattexts[i+7])
    if feat['benefit']:
        return feat
    else:
        return None
def scrape_feats(baseurls):
    """Scrape every feat linked from the index pages in *baseurls*.

    Prints each feat URL as a progress indicator, pickles the collected
    feats to 'feats.pickle', and returns them as a list of dicts.
    (Fixed: the collection/persistence code had been left commented out,
    so every scraped feat was silently discarded.)
    """
    urls = []
    for baseurl in baseurls:
        urls += scrape_featlist(baseurl)
    feats = []
    for url in urls:
        print(url)
        feat = scrape_feat(url)
        if feat:
            feats.append(feat)
        time.sleep(.3)  # rate-limit so we don't hammer the server
    with open('feats.pickle', 'wb') as f:
        pickle.dump(feats, f)
    return feats
if __name__ == '__main__':
    # Index pages whose linked feat pages are scraped.
    base_urls = [
        'http://www.d20pfsrd.com/feats/combat-feats',
        'http://www.d20pfsrd.com/feats/armor-mastery-feats',
        'http://www.d20pfsrd.com/feats/weapon-mastery-feats',
    ]
    scrape_feats(base_urls)
# /html/body/div/form/div[3]/div[2]/div[2]/div/table/tbody/tr[2]/td[1]
# #ctl00_MainContent_GridView6 > tbody:nth-child(1) > tr:nth-child(2) > td:nth-child(1)
# html body.dark div#wrapper.clearfix form#aspnetForm div#page.page.clearfix div#main-wrapper.main-wrapper div#main.main div table#ctl00_MainContent_GridView6 tbody tr td
# /html/body/div[2]/div
# body > div:nth-child(2) > div:nth-child(1)