#!/usr/bin/env python3
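"""Scrape Pathfinder feat data from d20pfsrd.com and pickle the results."""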
from lxml import html
import requests
import re
import time
import pickle
def scrape_featlist(url):
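    """Return URLs of the individual feat pages linked from an index page."""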
    r = requests.get(url)
    featspage = html.fromstring(r.content)
    #featspage = html.parse('/home/jmelesky/code/featscraper/assets/feats.html')
    urls = []
    # index pages list their child pages in a ul.ogn-childpages block;
    # collect links to the individual feat pages, skipping teamwork feats
    for featanchor in featspage.xpath("//ul[@class='ogn-childpages']/li/a"):
        featurl = featanchor.attrib['href']
        if (featurl.startswith(('http:', 'https:')) and 'feats' in featurl
            and not featurl.endswith('teamwork')
            and not featurl.endswith('teamwork/')):
            urls.append(featurl)
        elif (not featurl.startswith(('http:', 'https:'))
              and not featurl.endswith('teamwork')
              and not featurl.endswith('teamwork/')):
            # relative link: resolve it against the index page URL
            urls.append(url + '/' + featurl)
    return urls
def sanitext(text):
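    """Clean scraped text: strip labels, repair mojibake, escape for LaTeX."""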
    # strip a leading label colon, a trailing period, and carriage returns
    rettext = re.sub(r'^:\s+', '', text)
    rettext = re.sub(r'\.$', '', rettext)
    rettext = re.sub(r'\r', '', rettext)
    # repair mojibake from mis-decoded UTF-8 (en dash, right single quote),
    # then replace any leftover 'â' with a LaTeX em dash
    rettext = re.sub(r'â\x80\x93', '-', rettext)
    rettext = re.sub(r'â\x80\x99', "'", rettext)
    rettext = re.sub(r'â', r'\\textemdash', rettext)
    # normalize quotes, dashes, and percent signs for LaTeX output
    rettext = re.sub(r'"', "''", rettext)
    rettext = re.sub(r'’', "'", rettext)
    rettext = re.sub(r'%', r'\\%', rettext)
    rettext = re.sub(r'—', " -- ", rettext)
    rettext = re.sub(r'–', "-", rettext)
    # drop the sourcebook superscript tags (ACG, APG, etc.) that trail feat names
    rettext = re.sub(r'ACG', '', rettext)
    rettext = re.sub(r'APG', '', rettext)
    rettext = re.sub(r'ARG', '', rettext)
    rettext = re.sub(r'ISWG', '', rettext)
    rettext = re.sub(r'OA', '', rettext)
    rettext = re.sub(r'UC', '', rettext)
    rettext = re.sub(r'UI', '', rettext)
    rettext = re.sub(r'\(see the Pathfinder RPG Advanced Player\'s Guide\)', '', rettext)
    return rettext.strip()
def addreq(reqset, t, req):
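    """Append req under key t in reqset, storing values as lists."""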
    if t in reqset:
        if not isinstance(reqset[t], list):
            reqset[t] = [reqset[t]]
        reqset[t].append(req)
    else:
        reqset[t] = [req]
    return reqset
def chomp(text):
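    """Drop the first word, e.g. a leading label like 'Benefit:'."""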
    return text.split(' ', 1)[1]
def parse_prereqs(reqtext, sep=r'[,;]'):
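    """Parse a prerequisite string into a dict keyed by requirement type."""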
    reqs = {}
    for req in [sanitext(x).lower() for x in re.split(sep, reqtext)]:
        # compound requirements recurse, splitting on 'or'/'and' instead
        if req.find(' or ') > -1:
            reqs = addreq(reqs, 'or', parse_prereqs(req, sep=r'\Wor\W'))
            continue
        if req.find(' and ') > -1:
            reqs = addreq(reqs, 'and', parse_prereqs(req, sep=r'\Wand\W'))
            continue
        # abbreviated ability score, e.g. "str 13"
        m = re.search(r'^(\w{3}) (\d{1,2})\.?$', req)
        if m:
            reqs = addreq(reqs, m.group(1), m.group(2))
            continue
        # spelled-out ability scores
        m = re.search(r'^dexterity (\d+)', req)
        if m:
            reqs = addreq(reqs, 'dex', m.group(1))
            continue
        m = re.search(r'^strength (\d+)', req)
        if m:
            reqs = addreq(reqs, 'str', m.group(1))
            continue
        # base attack bonus, e.g. "base attack bonus +6"
        m = re.search(r'^base\W+attack\W+bonus\W+\+{0,1}(\d+)', req)
        if m:
            reqs = addreq(reqs, 'bab', m.group(1))
            continue
        # class level, e.g. "fighter level 4"
        m = re.search(r'^(.*?)\Wlevel (\d+)', req)
        if m:
            reqs = addreq(reqs, 'level', (m.group(1), m.group(2)))
            continue
        # level stated the other way around, e.g. "4th level fighter"
        m = re.search(r'^(\d+).*level (.*)$', req)
        if m:
            reqs = addreq(reqs, 'level', (m.group(2), m.group(1)))
            continue
        # skill ranks, e.g. "acrobatics 3 ranks"
        m = re.search(r'^(.*?) (\d+) rank', req)
        if m:
            reqs = addreq(reqs, 'skill', (m.group(1), m.group(2)))
            continue
        # anything else is assumed to be the name of another feat
        reqs = addreq(reqs, 'feat', req)
    return reqs
def scrape_feat(url):
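    """Scrape one feat page into a dict, or None if it should be skipped."""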
    r = requests.get(url)
    featpage = html.fromstring(r.content)
    #featpage = html.parse('/home/jmelesky/code/featscraper/assets/feat-ald.html')
    feat = {'prereqs': {},
            'benefit': '',
            'special': '',
            'trick': '',
    }
    nametext = featpage.xpath('//h1')[0].text_content()
    # teamwork feats are skipped entirely
    if nametext.find('Teamwork') >= 0:
        return None
    # drop any parenthetical (e.g. "(Combat)") from the feat name
    feat['name'] = sanitext(re.sub(r'\(.*?\)', '', nametext))
    feattexts = featpage.xpath('//div[@class="article-content"]/p')
    for i in feattexts:
        t = i.text_content()
        if t.startswith('Benefit') and feat['benefit'] == '':
            feat['benefit'] = sanitext(chomp(t))
        elif t.startswith('Prerequisite') and feat['benefit'] == '':
            # prerequisites are only expected before the benefit paragraph
            feat['prereqs'] = parse_prereqs(chomp(t))
        elif t.startswith('Special') and feat['special'] == '':
            feat['special'] = sanitext(chomp(t))
        elif t.startswith('Combat Trick'):
            feat['trick'] = sanitext(chomp(t))
        else:
            # once a benefit has started, further paragraphs (other than
            # "Normal:") are treated as benefit continuation text
            if feat['benefit'] != '' and not t.startswith('Normal'):
                feat['benefit'] += '\n\n'
                feat['benefit'] += sanitext(t)
            else:
                # log anything we didn't recognize
                print('>>> ' + t)
    # bulleted lists in the article body are appended to the benefit text
    extralist = featpage.xpath('//div[@class="article-content"]/ul/li')
    for extra in extralist:
        feat['benefit'] += '\n\n'
        feat['benefit'] += '- ' + sanitext(extra.text_content())
    if feat['benefit']:
        return feat
    else:
        return None
def scrape_feats(baseurls):
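    """Scrape every feat linked from the index pages into feats.pickle."""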
    urls = []
    for baseurl in baseurls:
        urls += scrape_featlist(baseurl)
    feats = []
    for url in urls:
        feat = scrape_feat(url)
        if feat:
            feats.append(feat)
        time.sleep(0.3)  # brief pause between requests to be polite to the site
    with open('feats.pickle', 'wb') as f:
        pickle.dump(feats, f)
if __name__ == '__main__':
    scrape_feats(['http://www.d20pfsrd.com/feats/combat-feats/all-combat-feats',
                  'http://www.d20pfsrd.com/feats/armor-mastery-feats',
                  'http://www.d20pfsrd.com/feats/weapon-mastery-feats'])
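# Reading the results back afterwards (not part of the scraper itself,
# just a sketch of how the pickled data can be used):
#
#   with open('feats.pickle', 'rb') as f:
#       feats = pickle.load(f)
#   for feat in feats:
#       print(feat['name'], feat['prereqs'])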