#!/usr/bin/env python3
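"""Scrape Pathfinder feat data from d20pfsrd.com feat index pages and pickle
the resulting list of feat dicts to feats.pickle. Scraped text is lightly
sanitized for later LaTeX use (see sanitext)."""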
from lxml import html
import requests
import re
import time
import pprint
import pickle
def scrape_featlist(url):
    """Return the URLs of the individual feat pages linked from a feat index page."""
    r = requests.get(url)
    featspage = html.fromstring(r.content)
    #featspage = html.parse('/home/jmelesky/code/featscraper/assets/feats.html')
    urls = []
    # for pages with a subpage list, collect each child feat link,
    # skipping teamwork feats
    for featanchor in featspage.xpath("//ul[@class='ogn-childpages']/li/a"):
        featurl = featanchor.attrib['href']
        if (featurl.startswith(('http://', 'https://'))
                and 'feats' in featurl
                and not featurl.endswith('teamwork')
                and not featurl.endswith('teamwork/')):
            urls.append(featurl)
        elif (not featurl.startswith(('http://', 'https://'))
                and not featurl.endswith('teamwork')
                and not featurl.endswith('teamwork/')):
            # relative link: resolve it against the index page URL
            urls.append(url + '/' + featurl)
    return urls
def sanitext(text):
    """Clean scraped text for LaTeX output: strip labels and stray
    punctuation, repair mojibake, escape special characters, and drop
    source-book abbreviations."""
    rettext = re.sub(r'^:\s+', '', text)
    rettext = re.sub(r'\.$', '', rettext)
    rettext = re.sub(r'\r', '', rettext)
    # repair common UTF-8-decoded-as-Latin-1 mojibake
    rettext = re.sub(r'â\x80\x93', '-', rettext)
    rettext = re.sub(r'â\x80\x99', "'", rettext)
    # any leftover mojibake lead byte is treated as an em dash
    rettext = re.sub(r'â', r'\\textemdash', rettext)
    # LaTeX-friendly quotes, dashes, and escapes
    rettext = re.sub(r'"', "''", rettext)
    rettext = re.sub(r'’', "'", rettext)
    rettext = re.sub(r'%', r'\%', rettext)
    rettext = re.sub(r'—', " -- ", rettext)
    rettext = re.sub(r'–', "-", rettext)
    # drop source-book abbreviations that appear as superscripts on feat names
    rettext = re.sub(r'ACG', '', rettext)
    rettext = re.sub(r'APG', '', rettext)
    rettext = re.sub(r'ARG', '', rettext)
    rettext = re.sub(r'ISWG', '', rettext)
    rettext = re.sub(r'OA', '', rettext)
    rettext = re.sub(r'UC', '', rettext)
    rettext = re.sub(r'UI', '', rettext)
    rettext = re.sub(r'\(see the Pathfinder RPG Advanced Player\'s Guide\)', '', rettext)
    return rettext.strip()
def addreq(reqset, t, req):
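    """Record req under key t in reqset, collecting multiple values in a list."""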
if t in reqset:
if not isinstance(reqset[t], list):
reqset[t] = [reqset[t]]
reqset[t].append(req)
else:
reqset[t] = [req]
return reqset
def chomp(text):
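    """Drop the leading label word (e.g. 'Benefit:') and return the rest of the text."""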
return text.split(' ', 1)[1]
def parse_prereqs(reqtext, sep=r'[,;]'):
    """Parse a prerequisite string into a dict keyed by requirement type."""
    reqs = {}
    for req in [sanitext(x).lower() for x in re.split(sep, reqtext)]:
        # compound requirements recurse with the conjunction as the separator
        if req.find(' or ') > -1:
            reqs = addreq(reqs, 'or', parse_prereqs(req, sep=r'\Wor\W'))
            continue
        if req.find(' and ') > -1:
            reqs = addreq(reqs, 'and', parse_prereqs(req, sep=r'\Wand\W'))
            continue
        # three-letter ability abbreviation, e.g. "dex 13"
        m = re.search(r'^(\w{3}) (\d{1,2})\.?$', req)
        if m:
            reqs = addreq(reqs, m.group(1), m.group(2))
            continue
        # spelled-out ability scores
        m = re.search(r'^dexterity (\d+)', req)
        if m:
            reqs = addreq(reqs, 'dex', m.group(1))
            continue
        m = re.search(r'^strength (\d+)', req)
        if m:
            reqs = addreq(reqs, 'str', m.group(1))
            continue
        # base attack bonus, e.g. "base attack bonus +6"
        m = re.search(r'^base\W+attack\W+bonus\W+\+{0,1}(\d+)', req)
        if m:
            reqs = addreq(reqs, 'bab', m.group(1))
            continue
        # class or character level, e.g. "fighter level 4" ...
        m = re.search(r'^(.*?)\Wlevel (\d+)', req)
        if m:
            reqs = addreq(reqs, 'level', (m.group(1), m.group(2)))
            continue
        # ... or "4th-level fighter"
        m = re.search(r'^(\d+).*level (.*)$', req)
        if m:
            reqs = addreq(reqs, 'level', (m.group(2), m.group(1)))
            continue
        # skill ranks, e.g. "acrobatics 3 ranks"
        m = re.search(r'^(.*?) (\d+) rank', req)
        if m:
            reqs = addreq(reqs, 'skill', (m.group(1), m.group(2)))
            continue
        # anything else is assumed to be another feat
        reqs = addreq(reqs, 'feat', req)
    return reqs
def scrape_feat(url):
    """Scrape a single feat page and return a feat dict, or None for
    teamwork feats and pages with no benefit text."""
    r = requests.get(url)
    featpage = html.fromstring(r.content)
    #featpage = html.parse('/home/jmelesky/code/featscraper/assets/feat-ald.html')
    feat = {'prereqs': {},
            'benefit': '',
            'special': '',
            'trick': '',
            }
    nametext = featpage.xpath('//h1')[0].text_content()
    if nametext.find('Teamwork') < 0:
        # drop any parenthetical (e.g. "(Combat)") from the feat name
        feat['name'] = sanitext(re.sub(r'\(.*?\)', '', nametext))
    else:
        return None
    feattexts = featpage.xpath('//div[@class="article-content"]/p')
    for i in feattexts:
        t = i.text_content()
        if t.startswith('Benefit') and feat['benefit'] == '':
            feat['benefit'] = sanitext(chomp(t))
        elif t.startswith('Prerequisite') and feat['prereqs'] == {}:
            feat['prereqs'] = parse_prereqs(chomp(t))
        elif t.startswith('Special') and feat['special'] == '':
            feat['special'] = sanitext(chomp(t))
        elif t.startswith('Combat Trick'):
            feat['trick'] = sanitext(chomp(t))
        else:
            # continuation paragraphs are appended to the benefit text;
            # "Normal" paragraphs and anything seen before the benefit
            # are just printed for inspection
            if feat['benefit'] != '' and not t.startswith('Normal'):
                feat['benefit'] += '\n\n'
                feat['benefit'] += sanitext(t)
            else:
                print('>>> ' + t)
    # bulleted lists in the article body become list items in the benefit text
    extralist = featpage.xpath('//div[@class="article-content"]/ul/li')
    for extra in extralist:
        feat['benefit'] += '\n\n'
        feat['benefit'] += '- ' + sanitext(extra.text_content())
    if feat['benefit']:
        return feat
    else:
        return None
def scrape_feats(baseurls):
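    """Scrape every feat linked from the given index pages and pickle the
    resulting list of feat dicts to feats.pickle."""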
urls = []
for baseurl in baseurls:
urls += scrape_featlist(baseurl)
feats = []
for url in urls:
feat = scrape_feat(url)
if feat:
feats.append(feat)
time.sleep(.3)
with open('feats.pickle', 'wb') as f:
pickle.dump(feats, f)
if __name__ == '__main__':
scrape_feats(['http://www.d20pfsrd.com/feats/combat-feats/all-combat-feats',
'http://www.d20pfsrd.com/feats/armor-mastery-feats',
'http://www.d20pfsrd.com/feats/weapon-mastery-feats'])
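# A minimal sketch of inspecting the pickled output later (run separately;
# assumes scrape_feats() has already written feats.pickle):
#
#   import pickle
#   import pprint
#   with open('feats.pickle', 'rb') as f:
#       feats = pickle.load(f)
#   pprint.pprint(feats[0])  # {'name': ..., 'prereqs': ..., 'benefit': ...}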