|
@@ -0,0 +1,145 @@
|
|
|
+#!/usr/bin/env python3
|
|
|
+
|
|
|
+from lxml import html
|
|
|
+import requests
|
|
|
+import re
|
|
|
+import time
|
|
|
+import pprint
|
|
|
+import pickle
|
|
|
+
|
|
|
+
|
|
|
def scrape_featlist(url):
    """Return the ``href`` of every feat link in the feat index table.

    NOTE: ``url`` is currently unused. The live fetch is disabled and a
    locally saved copy of the index page is parsed instead.

    Returns a list of the ``href`` attribute values, in document order, of
    the anchors found in the first cell of each row of the table with id
    ``ctl00_MainContent_GridView6``.
    """
    # Live fetch, currently disabled:
    #   r = requests.get(url)
    #   featspage = html.fromstring(r.content)
    featspage = html.parse('/home/jmelesky/code/featscraper/assets/feats.html')

    anchors = featspage.xpath('//table[@id="ctl00_MainContent_GridView6"]/tr/td[1]/a')
    return [anchor.attrib['href'] for anchor in anchors]
|
|
|
+
|
|
|
+
|
|
|
def sanitext(text):
    """Clean up a scraped text fragment and return the result.

    The substitutions below are applied one after another, each on the
    output of the previous one, and the final string is stripped of
    surrounding whitespace:

    * a leading ``:`` followed by whitespace is dropped;
    * a single trailing period is dropped;
    * the sequences ``â\\x80\\x93`` and ``â\\x80\\x99`` (the UTF-8 bytes of
      an en dash / right single quote read as single-byte characters) are
      replaced by ``-`` and ``'``;
    * the markers ACG, APG, ARG, ISWG, OA, UC and UI (presumably
      source-book abbreviations) are deleted wherever they occur;
    * the parenthetical pointer to the Advanced Player's Guide is deleted.
    """
    # Order matters: later patterns see the text produced by earlier ones.
    substitutions = (
        (r'^:\s+', ''),
        (r'\.$', ''),
        (r'â\x80\x93', '-'),
        (r'â\x80\x99', "'"),
        (r'ACG', ''),
        (r'APG', ''),
        (r'ARG', ''),
        (r'ISWG', ''),
        (r'OA', ''),
        (r'UC', ''),
        (r'UI', ''),
        (r'\(see the Pathfinder RPG Advanced Player\'s Guide\)', ''),
    )

    cleaned = text
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    return cleaned.strip()
|
|
|
+
|
|
|
+
|
|
|
def addreq(reqset, t, req):
    """Record requirement ``req`` under key ``t`` in ``reqset``.

    The first value stored for a key is kept as-is. When a second value
    arrives for the same key, the stored value is turned into a list (unless
    it already is one) and the new value is appended to it.

    ``reqset`` is modified in place and also returned.
    """
    if t not in reqset:
        reqset[t] = req
        return reqset

    current = reqset[t]
    if isinstance(current, list):
        current.append(req)
    else:
        # Promote the single stored value to a list holding both entries.
        reqset[t] = [current, req]

    return reqset
|
|
|
+
|
|
|
+
|
|
|
def parse_prereqs(reqtext, sep=r'[,;]'):
    """Parse a prerequisites string into a dict of requirements.

    ``reqtext`` is split on the regular expression ``sep``; every piece is
    cleaned with ``sanitext`` and lower-cased, then classified:

    * contains ``' or '``  -> stored under ``'or'`` as a nested dict obtained
      by re-parsing the piece with ``' or '`` as separator;
    * contains ``' and '`` -> same, under ``'and'``;
    * three word characters, a space and two digits -> stored under the
      three-character word, with the digits as value;
    * ``base attack bonus +N`` -> ``'bab'`` with value ``N``;
    * ``<name> level N``       -> ``'level'`` with value ``{name: N}``;
    * ``<name> N rank``        -> ``'skill'`` with value ``{name: N}``;
    * anything else            -> ``'feat'`` with the piece itself as value.

    All numbers are kept as strings. Values are stored through ``addreq``,
    so a key that occurs more than once ends up holding a list.
    """
    # Tried in order; the first pattern that matches decides key and value.
    classifiers = (
        (r'^(\w{3}) (\d{2})$',
         lambda m: (m.group(1), m.group(2))),
        (r'^base attack bonus \+(\d+)$',
         lambda m: ('bab', m.group(1))),
        (r'^(.*?) level (\d+)',
         lambda m: ('level', {m.group(1): m.group(2)})),
        (r'^(.*?) (\d+) rank',
         lambda m: ('skill', {m.group(1): m.group(2)})),
    )

    reqs = {}
    for piece in re.split(sep, reqtext):
        req = sanitext(piece).lower()

        # Compound requirements are parsed recursively; 'or' is checked first.
        if ' or ' in req:
            reqs = addreq(reqs, 'or', parse_prereqs(req, sep=' or '))
            continue
        if ' and ' in req:
            reqs = addreq(reqs, 'and', parse_prereqs(req, sep=' and '))
            continue

        for pattern, build in classifiers:
            m = re.search(pattern, req)
            if m:
                key, value = build(m)
                break
        else:
            # No pattern matched: treat the whole piece as a feat name.
            key, value = 'feat', req

        reqs = addreq(reqs, key, value)

    return reqs
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
def scrape_feat(url):
    """Download one feat page and extract its fields.

    The text fragments inside the span with id
    ``ctl00_MainContent_DataListTypes_ctl00_LabelName`` are walked in order:

    * the first fragment is the feat name (parenthesised parts removed);
      if it contains ``Teamwork`` the page is abandoned;
    * the fragment after a ``Benefit`` label becomes ``'benefit'`` (first
      occurrence only);
    * the fragment after a ``Prerequisites`` label is parsed into
      ``'prereqs'``, but only while no benefit has been recorded yet;
    * the fragment after a ``Special`` label becomes ``'special'`` (first
      occurrence only);
    * the fragment seven positions after a ``Combat Trick`` label becomes
      ``'trick'`` (NOTE(review): the offset of 7 presumably reflects the
      page markup -- confirm against a live page).

    Returns the feat dict, or ``None`` when no benefit text was found or
    the page has no description span.
    """
    r = requests.get(url)
    featpage = html.fromstring(r.content)

    spans = featpage.xpath('//span[@id="ctl00_MainContent_DataListTypes_ctl00_LabelName"]')
    if not spans:
        # No description block on this page: report "no feat" instead of
        # failing with an IndexError on spans[0].
        return None

    feattexts = list(spans[0].itertext())

    feat = {'prereqs': {},
            'benefit': '',
            'special': '',
            'trick': '',
            }

    # The last fragment is never treated as a label, so i + 1 is always valid.
    for i, t in enumerate(feattexts[:-1]):
        if i == 0:
            if 'Teamwork' in t:
                break
            feat['name'] = sanitext(re.sub(r'\(.*?\)', '', t))
        elif t == 'Benefit' and feat['benefit'] == '':
            feat['benefit'] = sanitext(feattexts[i + 1])
        elif t == 'Prerequisites' and feat['benefit'] == '':
            feat['prereqs'] = parse_prereqs(feattexts[i + 1])
        elif t == 'Special' and feat['special'] == '':
            feat['special'] = sanitext(feattexts[i + 1])
        elif t == 'Combat Trick' and i + 7 < len(feattexts):
            # Bounds check: a label close to the end of the page has no
            # fragment seven positions ahead; skip it rather than crash.
            feat['trick'] = sanitext(feattexts[i + 7])

    if feat['benefit']:
        return feat
    return None
|
|
|
+
|
|
|
+
|
|
|
def scrape_feats(base, url):
    """Scrape every feat linked from an index page and pickle the results.

    ``base + url`` is handed to ``scrape_featlist`` to obtain the feat
    links. Each link is printed, resolved against ``base`` and passed to
    ``scrape_feat``; results that are truthy are collected. The process
    sleeps 0.3 seconds after each page.

    The collected list is written to ``feats.pickle`` in the current
    working directory. Nothing is returned.
    """
    collected = []
    for feat_url in scrape_featlist(base + url):
        print(feat_url)
        feat = scrape_feat(base + feat_url)
        if feat:
            collected.append(feat)
        time.sleep(.3)

    with open('feats.pickle', 'wb') as outfile:
        pickle.dump(collected, outfile)
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
if __name__ == '__main__':
    # Script entry point: scrape the feats listed under Category=Combat.
    scrape_feats(base='http://www.archivesofnethys.com/',
                 url='Feats.aspx?Category=Combat')
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+# /html/body/div/form/div[3]/div[2]/div[2]/div/table/tbody/tr[2]/td[1]
|
|
|
+# #ctl00_MainContent_GridView6 > tbody:nth-child(1) > tr:nth-child(2) > td:nth-child(1)
|
|
|
+# html body.dark div#wrapper.clearfix form#aspnetForm div#page.page.clearfix div#main-wrapper.main-wrapper div#main.main div table#ctl00_MainContent_GridView6 tbody tr td
|
|
|
+
|
|
|
+# /html/body/div[2]/div
|
|
|
+# body > div:nth-child(2) > div:nth-child(1)
|