123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145 |
- #!/usr/bin/env python3
- from lxml import html
- import requests
- import re
- import time
- import pprint
- import pickle
def scrape_featlist(url, cached_page='/home/jmelesky/code/featscraper/assets/feats.html'):
    """Return the href of every feat linked from the feat index table.

    Args:
        url: Address of the live feat index page. It is only requested
            when ``cached_page`` is None.
        cached_page: Source handed to ``html.parse`` instead of hitting the
            network. The default is the local snapshot this script has
            always read, so existing callers behave exactly as before.
            Pass None to download ``url`` instead.

    Returns:
        A list of the ``href`` attribute values of the anchors found in the
        first cell of each row of the ``ctl00_MainContent_GridView6``
        table, in document order.
    """
    if cached_page is None:
        r = requests.get(url)
        featspage = html.fromstring(r.content)
    else:
        featspage = html.parse(cached_page)

    anchors = featspage.xpath(
        '//table[@id="ctl00_MainContent_GridView6"]/tr/td[1]/a')
    return [anchor.attrib['href'] for anchor in anchors]
def sanitext(text):
    """Clean a scraped text fragment and return it stripped of whitespace.

    The substitutions below are applied one after another, in order:
    a leading colon plus whitespace is dropped, a period at the very end is
    dropped, two mis-decoded punctuation sequences are mapped to ``-`` and
    ``'``, a fixed set of upper-case book abbreviations is deleted wherever
    it occurs, and one parenthetical book reference is deleted.
    """
    substitutions = (
        (r'^:\s+', ''),
        (r'\.$', ''),
        (r'â\x80\x93', '-'),
        (r'â\x80\x99', "'"),
        (r'ACG', ''),
        (r'APG', ''),
        (r'ARG', ''),
        (r'ISWG', ''),
        (r'OA', ''),
        (r'UC', ''),
        (r'UI', ''),
        (r"\(see the Pathfinder RPG Advanced Player\'s Guide\)", ''),
    )
    cleaned = text
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    # Whitespace is trimmed last, after every substitution has run.
    return cleaned.strip()
def addreq(reqset, t, req):
    """Record requirement ``req`` under key ``t`` in ``reqset``.

    The first value stored for a key is kept bare; a second value turns the
    entry into a two-element list, and later values are appended to that
    list. ``reqset`` is modified in place and also returned.
    """
    if t not in reqset:
        reqset[t] = req
        return reqset

    current = reqset[t]
    if isinstance(current, list):
        current.append(req)
    else:
        # Promote the single stored value to a list holding both.
        reqset[t] = [current, req]
    return reqset
def parse_prereqs(reqtext, sep=r'[,;]'):
    """Parse a prerequisite string into a dict of categorised requirements.

    ``reqtext`` is split on the regular expression ``sep``; each fragment is
    cleaned with ``sanitext`` and lower-cased, then classified by the first
    rule that applies:

    * contains ``' or '``  -> key ``'or'``, value is the fragment parsed
      recursively with ``' or '`` as the separator
    * contains ``' and '`` -> key ``'and'``, parsed recursively likewise
    * three word characters, a space and two digits (e.g. ``'dex 13'``)
      -> key is the three-character word, value the digits
    * ``'base attack bonus +N'`` -> key ``'bab'``, value ``N``
    * ``'<x> level N...'`` -> key ``'level'``, value ``{x: N}``
    * ``'<x> N rank...'``  -> key ``'skill'``, value ``{x: N}``
    * anything else        -> key ``'feat'``, value the fragment itself

    All numbers are kept as strings. Entries are stored through ``addreq``.
    Fragments that are empty after cleaning (empty input, trailing or
    doubled separators) are skipped rather than recorded as a feat named
    ``''``.

    Args:
        reqtext: Raw prerequisite text.
        sep: Regular expression used to split ``reqtext`` into fragments.

    Returns:
        The dict of parsed requirements (empty if nothing was found).
    """
    reqs = {}
    for fragment in re.split(sep, reqtext):
        # Clean before lower-casing: sanitext removes upper-case markers.
        req = sanitext(fragment).lower()
        if not req:
            continue

        if ' or ' in req:
            reqs = addreq(reqs, 'or', parse_prereqs(req, sep=' or '))
            continue
        if ' and ' in req:
            reqs = addreq(reqs, 'and', parse_prereqs(req, sep=' and '))
            continue

        m = re.search(r'^(\w{3}) (\d{2})$', req)
        if m:
            reqs = addreq(reqs, m.group(1), m.group(2))
            continue

        m = re.search(r'^base attack bonus \+(\d+)$', req)
        if m:
            reqs = addreq(reqs, 'bab', m.group(1))
            continue

        m = re.search(r'^(.*?) level (\d+)', req)
        if m:
            reqs = addreq(reqs, 'level', {m.group(1): m.group(2)})
            continue

        m = re.search(r'^(.*?) (\d+) rank', req)
        if m:
            reqs = addreq(reqs, 'skill', {m.group(1): m.group(2)})
            continue

        # Nothing matched: treat the fragment as the name of a feat.
        reqs = addreq(reqs, 'feat', req)
    return reqs
def scrape_feat(url):
    """Download one feat page and extract its name and rule text.

    The page's label span is flattened into a list of text nodes. The first
    node supplies the feat name; after that, a node equal to a section
    heading ('Benefit', 'Prerequisites', 'Special', 'Combat Trick') causes
    a following node to be stored as that section's content.

    Args:
        url: Address of the feat page, fetched with ``requests.get``.

    Returns:
        A dict with keys 'name', 'prereqs', 'benefit', 'special' and
        'trick', or None when any of the following holds: the label span is
        missing from the page, the first text node contains 'Teamwork', or
        no 'Benefit' text was found.
    """
    r = requests.get(url)
    featpage = html.fromstring(r.content)

    spans = featpage.xpath(
        '//span[@id="ctl00_MainContent_DataListTypes_ctl00_LabelName"]')
    if not spans:
        # Without the label span there is nothing to parse; report "no
        # feat" the same way as a page lacking a Benefit section, instead
        # of raising IndexError.
        return None
    feattexts = list(spans[0].itertext())

    feat = {
        'prereqs': {},
        'benefit': '',
        'special': '',
        'trick': '',
    }

    # The last node is never examined as a heading, so feattexts[i + 1]
    # below always exists.
    for i, t in enumerate(feattexts[:-1]):
        if i == 0:
            if 'Teamwork' in t:
                # Pages whose first text mentions 'Teamwork' are skipped.
                break
            # Drop any parenthesised qualifier from the name.
            feat['name'] = sanitext(re.sub(r'\(.*?\)', '', t))
        elif t == 'Benefit' and feat['benefit'] == '':
            feat['benefit'] = sanitext(feattexts[i + 1])
        # NOTE(review): this branch tests 'benefit', not 'prereqs', so a
        # 'Prerequisites' heading is only honoured until a benefit has been
        # captured. Kept as found; confirm it is intentional.
        elif t == 'Prerequisites' and feat['benefit'] == '':
            feat['prereqs'] = parse_prereqs(feattexts[i + 1])
        elif t == 'Special' and feat['special'] == '':
            feat['special'] = sanitext(feattexts[i + 1])
        elif t == 'Combat Trick' and i + 7 < len(feattexts):
            # The trick text is read seven nodes past its heading
            # (presumably skipping intermediate markup; verify against a
            # live page). The bounds check avoids an IndexError when the
            # heading sits too close to the end.
            feat['trick'] = sanitext(feattexts[i + 7])

    return feat if feat['benefit'] else None
def scrape_feats(base, url):
    """Scrape every feat listed on an index page and pickle the results.

    The index is read via ``scrape_featlist(base + url)``. Each returned
    link is printed, resolved as ``base + link`` and passed to
    ``scrape_feat``; truthy results are collected. After each page the
    function sleeps for 0.3 seconds. The collected list is finally written
    to 'feats.pickle' in the current working directory.
    """
    collected = []
    for feat_link in scrape_featlist(base + url):
        print(feat_link)
        parsed = scrape_feat(base + feat_link)
        if parsed:
            collected.append(parsed)
        # Pause between requests.
        time.sleep(.3)

    # NOTE(review): the dump is taken to happen once, after the loop.
    with open('feats.pickle', 'wb') as out:
        pickle.dump(collected, out)
if __name__ == '__main__':
    # Script entry point: scrape the combat-feat index of the site.
    scrape_feats(
        'http://www.archivesofnethys.com/',
        'Feats.aspx?Category=Combat',
    )
- # /html/body/div/form/div[3]/div[2]/div[2]/div/table/tbody/tr[2]/td[1]
- # #ctl00_MainContent_GridView6 > tbody:nth-child(1) > tr:nth-child(2) > td:nth-child(1)
- # html body.dark div#wrapper.clearfix form#aspnetForm div#page.page.clearfix div#main-wrapper.main-wrapper div#main.main div table#ctl00_MainContent_GridView6 tbody tr td
- # /html/body/div[2]/div
- # body > div:nth-child(2) > div:nth-child(1)
|