#!/usr/bin/env python3

from lxml import html
import requests
import re
import time
import pprint
import pickle


def scrape_featlist(url):
    # collect the individual feat-page URLs linked from a feat-list page
    r = requests.get(url)
    featspage = html.fromstring(r.content)
    # featspage = html.parse('/home/jmelesky/code/featscraper/assets/feats.html')

    urls = []

    # for pages with subpages list
    for featanchor in featspage.xpath("//ul[@class='ogn-childpages']/li/a"):
        featurl = featanchor.attrib['href']
        if (featurl.startswith('http:')
                and 'feats' in featurl
                and not featurl.endswith('teamwork')
                and not featurl.endswith('teamwork/')):
            urls.append(featurl)
        elif (not featurl.startswith('http:')
                and not featurl.endswith('teamwork')
                and not featurl.endswith('teamwork/')):
            urls.append(url + '/' + featurl)

    return urls


def sanitext(text):
    # clean scraped text: strip leading/trailing punctuation, repair mojibake,
    # escape for LaTeX output, and drop source-book abbreviations
    rettext = re.sub(r'^:\s+', '', text)
    rettext = re.sub(r'\.$', '', rettext)
    rettext = re.sub(r'\r', '', rettext)
    rettext = re.sub(r'â\x80\x93', '-', rettext)
    rettext = re.sub(r'â\x80\x99', "'", rettext)
    # double backslash so the literal LaTeX command survives re.sub's
    # replacement-string escape processing
    rettext = re.sub(r'—', r'\\textemdash', rettext)
    rettext = re.sub(r'"', "''", rettext)
    rettext = re.sub(r'’', "'", rettext)
    rettext = re.sub(r'%', r'\%', rettext)
    rettext = re.sub(r'—', " -- ", rettext)
    rettext = re.sub(r'–', "-", rettext)
    rettext = re.sub(r'ACG', '', rettext)
    rettext = re.sub(r'APG', '', rettext)
    rettext = re.sub(r'ARG', '', rettext)
    rettext = re.sub(r'ISWG', '', rettext)
    rettext = re.sub(r'OA', '', rettext)
    rettext = re.sub(r'UC', '', rettext)
    rettext = re.sub(r'UI', '', rettext)
    rettext = re.sub(r'\(see the Pathfinder RPG Advanced Player\'s Guide\)', '', rettext)

    return rettext.strip()


def addreq(reqset, t, req):
    # record a requirement under key t, collecting multiple values into a list
    if t in reqset:
        if not isinstance(reqset[t], list):
            reqset[t] = [reqset[t]]
        reqset[t].append(req)
    else:
        reqset[t] = [req]

    return reqset


def chomp(text):
    # drop the leading label (e.g. "Benefit:") from a paragraph
    return text.split(' ', 1)[1]


def parse_prereqs(reqtext, sep=r'[,;]'):
    # parse a prerequisite string into a dict keyed by requirement type
    # (ability score, bab, level, skill, feat, or nested and/or groups)
    reqs = {}

    for req in [sanitext(x).lower() for x in re.split(sep, reqtext)]:
        if req.find(' or ') > -1:
            reqs = addreq(reqs, 'or', parse_prereqs(req, sep=r'\Wor\W'))
        elif req.find(' and ') > -1:
            reqs = addreq(reqs, 'and', parse_prereqs(req, sep=r'\Wand\W'))
        else:
            m = re.search(r'^(\w{3}) (\d{1,2})\.?$', req)
            if m:
                reqs = addreq(reqs, m.group(1), m.group(2))
            else:
                m = re.search(r'^dexterity (\d+)', req)
                if m:
                    reqs = addreq(reqs, 'dex', m.group(1))
                else:
                    m = re.search(r'^strength (\d+)', req)
                    if m:
                        reqs = addreq(reqs, 'str', m.group(1))
                    else:
                        m = re.search(r'^base\W+attack\W+bonus\W+\+{0,1}(\d+)', req)
                        if m:
                            reqs = addreq(reqs, 'bab', m.group(1))
                        else:
                            m = re.search(r'^(.*?)\Wlevel (\d+)', req)
                            if m:
                                reqs = addreq(reqs, 'level', (m.group(1), m.group(2)))
                            else:
                                m = re.search(r'^(\d+).*level (.*)$', req)
                                if m:
                                    reqs = addreq(reqs, 'level', (m.group(2), m.group(1)))
                                else:
                                    m = re.search(r'^(.*?) (\d+) rank', req)
                                    if m:
                                        reqs = addreq(reqs, 'skill', (m.group(1), m.group(2)))
                                    else:
                                        reqs = addreq(reqs, 'feat', req)

    return reqs


def scrape_feat(url):
    # scrape a single feat page into a dict; returns None for teamwork feats
    # or pages with no benefit text
    r = requests.get(url)
    featpage = html.fromstring(r.content)
    # featpage = html.parse('/home/jmelesky/code/featscraper/assets/feat-ald.html')

    feat = {'prereqs': {},
            'benefit': '',
            'special': '',
            'trick': '',
            }

    nametext = featpage.xpath('//h1')[0].text_content()
    if nametext.find('Teamwork') < 0:
        feat['name'] = sanitext(re.sub(r'\(.*?\)', '', nametext))
    else:
        return None

    feattexts = [x for x in featpage.xpath('//div[@class="article-content"]/p')]

    for i in feattexts:
        t = i.text_content()
        if t.startswith('Benefit') and feat['benefit'] == '':
            feat['benefit'] = sanitext(chomp(t))
        elif t.startswith('Prerequisite') and feat['benefit'] == '':
            # prerequisites only count if they appear before the benefit text
            feat['prereqs'] = parse_prereqs(chomp(t))
        elif t.startswith('Special') and feat['special'] == '':
            feat['special'] = sanitext(chomp(t))
        elif t.startswith('Combat Trick'):
            feat['trick'] = sanitext(chomp(t))
        else:
            if feat['benefit'] != '' and not t.startswith('Normal'):
                feat['benefit'] += '\n\n'
                feat['benefit'] += sanitext(t)
            else:
                print('>>> ' + t)

    extralist = [x for x in featpage.xpath('//div[@class="article-content"]/ul/li')]
    for extra in extralist:
        feat['benefit'] += '\n\n'
        feat['benefit'] += '- ' + sanitext(extra.text_content())

    if feat['benefit']:
        return feat
    else:
        return None


def scrape_feats(baseurls):
    # scrape every feat linked from the given list pages and pickle the results
    urls = []
    for baseurl in baseurls:
        urls += scrape_featlist(baseurl)

    feats = []
    for url in urls:
        feat = scrape_feat(url)
        if feat:
            feats.append(feat)
        time.sleep(.3)

    with open('feats.pickle', 'wb') as f:
        pickle.dump(feats, f)


if __name__ == '__main__':
    scrape_feats(['http://www.d20pfsrd.com/feats/combat-feats/all-combat-feats',
                  'http://www.d20pfsrd.com/feats/armor-mastery-feats',
                  'http://www.d20pfsrd.com/feats/weapon-mastery-feats'])


# /html/body/div/form/div[3]/div[2]/div[2]/div/table/tbody/tr[2]/td[1]
# #ctl00_MainContent_GridView6 > tbody:nth-child(1) > tr:nth-child(2) > td:nth-child(1)
# html body.dark div#wrapper.clearfix form#aspnetForm div#page.page.clearfix div#main-wrapper.main-wrapper div#main.main div table#ctl00_MainContent_GridView6 tbody tr td
# /html/body/div[2]/div
# body > div:nth-child(2) > div:nth-child(1)
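
# A minimal sketch (not part of the scraper itself) of reading back the data
# that scrape_feats() writes, assuming a feats.pickle produced by a prior run
# exists in the current directory:
#
#     import pickle
#     import pprint
#
#     with open('feats.pickle', 'rb') as f:
#         feats = pickle.load(f)
#     pprint.pprint(feats[0])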