jmelesky 7 年之前
父节点
当前提交
497adb714b
共有 3 个文件被更改,包括 259 次插入和 0 次删除
  1. 14 0
      Pipfile
  2. 100 0
      Pipfile.lock
  3. 145 0
      scrapefeats.py

+ 14 - 0
Pipfile

@@ -0,0 +1,14 @@
+[[source]]
+url = "https://pypi.python.org/simple"
+verify_ssl = true
+name = "pypi"
+
+[packages]
+requests = "*"
+lxml = "*"
+"beautifulsoup4" = "*"
+
+[dev-packages]
+
+[requires]
+python_version = "3.6"

+ 100 - 0
Pipfile.lock

@@ -0,0 +1,100 @@
+{
+    "_meta": {
+        "hash": {
+            "sha256": "cf67a571f94de6511c09a3a20817ec686ba008d8a11837365f283c34fc7ec080"
+        },
+        "pipfile-spec": 6,
+        "requires": {
+            "python_version": "3.6"
+        },
+        "sources": [
+            {
+                "name": "pypi",
+                "url": "https://pypi.python.org/simple",
+                "verify_ssl": true
+            }
+        ]
+    },
+    "default": {
+        "beautifulsoup4": {
+            "hashes": [
+                "sha256:11a9a27b7d3bddc6d86f59fb76afb70e921a25ac2d6cc55b40d072bd68435a76",
+                "sha256:7015e76bf32f1f574636c4288399a6de66ce08fb7b2457f628a8d70c0fbabb11",
+                "sha256:808b6ac932dccb0a4126558f7dfdcf41710dd44a4ef497a0bb59a77f9f078e89"
+            ],
+            "index": "pypi",
+            "version": "==4.6.0"
+        },
+        "certifi": {
+            "hashes": [
+                "sha256:14131608ad2fd56836d33a71ee60fa1c82bc9d2c8d98b7bdbc631fe1b3cd1296",
+                "sha256:edbc3f203427eef571f79a7692bb160a2b0f7ccaa31953e99bd17e307cf63f7d"
+            ],
+            "version": "==2018.1.18"
+        },
+        "chardet": {
+            "hashes": [
+                "sha256:84ab92ed1c4d4f16916e05906b6b75a6c0fb5db821cc65e70cbd64a3e2a5eaae",
+                "sha256:fc323ffcaeaed0e0a02bf4d117757b98aed530d9ed4531e3e15460124c106691"
+            ],
+            "version": "==3.0.4"
+        },
+        "idna": {
+            "hashes": [
+                "sha256:2c6a5de3089009e3da7c5dde64a141dbc8551d5b7f6cf4ed7c2568d0cc520a8f",
+                "sha256:8c7309c718f94b3a625cb648ace320157ad16ff131ae0af362c9f21b80ef6ec4"
+            ],
+            "version": "==2.6"
+        },
+        "lxml": {
+            "hashes": [
+                "sha256:01c45df6d90497c20aa2a07789a41941f9a1029faa30bf725fc7f6d515b1afe9",
+                "sha256:0c9fef4f8d444e337df96c54544aeb85b7215b2ed7483bb6c35de97ac99f1bcd",
+                "sha256:0e3cd94c95d30ba9ca3cff40e9b2a14e1a10a4fd8131105b86c6b61648f57e4b",
+                "sha256:0e7996e9b46b4d8b4ac1c329a00e2d10edcd8380b95d2a676fccabf4c1dd0512",
+                "sha256:1858b1933d483ec5727549d3fe166eeb54229fbd6a9d3d7ea26d2c8a28048058",
+                "sha256:1b164bba1320b14905dcff77da10d5ce9c411ac4acc4fb4ed9a2a4d10fae38c9",
+                "sha256:1b46f37927fa6cd1f3fe34b54f1a23bd5bea1d905657289e08e1297069a1a597",
+                "sha256:231047b05907315ae9a9b6925751f9fd2c479cf7b100fff62485a25e382ca0d4",
+                "sha256:28f0c6652c1b130f1e576b60532f84b19379485eb8da6185c29bd8c9c9bc97bf",
+                "sha256:34d49d0f72dd82b9530322c48b70ac78cca0911275da741c3b1d2f3603c5f295",
+                "sha256:3682a17fbf72d56d7e46db2e80ca23850b79c28cfe75dcd9b82f58808f730909",
+                "sha256:3cf2830b9a6ad7f6e965fa53a768d4d2372a7856f20ffa6ce43d2fe9c0d34b19",
+                "sha256:5b653c9379ce29ce271fbe1010c5396670f018e78b643e21beefbb3dc6d291de",
+                "sha256:65a272821d5d8194358d6b46f3ca727fa56a6b63981606eac737c86d27309cdd",
+                "sha256:691f2cd97cf026c611df1ea5055755eec7f878f2d4f4330dc8686583de6fc5fd",
+                "sha256:6b6379495d3baacf7ed755ac68547c8dff6ce5d37bf370f0b7678888dc1283f9",
+                "sha256:75322a531504d4f383264391d89993a42e286da8821ddc5ac315e57305cb84f0",
+                "sha256:7f457cbda964257f443bac861d3a36732dcba8183149e7818ee2fb7c86901b94",
+                "sha256:7ff1fc76d8804e0f870c343a72007ff587090c218b0f92d8ee784ac2b6eaf5b9",
+                "sha256:8523fbde9c2216f3f2b950cb01ebe52e785eaa8a07ffeb456dd3576ca1b4fb9b",
+                "sha256:8f37627f16e026523fca326f1b5c9a43534862fede6c3e99c2ba6a776d75c1ab",
+                "sha256:a7182ea298cc3555ea56ffbb0748fe0d5e0d81451e2bc16d7f4645cd01b1ca70",
+                "sha256:abbd2fb4a5a04c11b5e04eb146659a0cf67bb237dd3d7ca3b9994d3a9f826e55",
+                "sha256:accc9f6b77bed0a6f267b4fae120f6008a951193d548cdbe9b61fc98a08b1cf8",
+                "sha256:bd88c8ce0d1504fdfd96a35911dd4f3edfb2e560d7cfdb5a3d09aa571ae5fbae",
+                "sha256:c557ad647facb3c0027a9d0af58853f905e85a0a2f04dcb73f8e665272fcdc3a",
+                "sha256:defabb7fbb99f9f7b3e0b24b286a46855caef4776495211b066e9e6592d12b04",
+                "sha256:e2629cdbcad82b83922a3488937632a4983ecc0fed3e5cfbf430d069382eeb9b"
+            ],
+            "index": "pypi",
+            "version": "==4.2.1"
+        },
+        "requests": {
+            "hashes": [
+                "sha256:6a1b267aa90cac58ac3a765d067950e7dbbf75b1da07e895d1f594193a40a38b",
+                "sha256:9c443e7324ba5b85070c4a818ade28bfabedf16ea10206da1132edaa6dda237e"
+            ],
+            "index": "pypi",
+            "version": "==2.18.4"
+        },
+        "urllib3": {
+            "hashes": [
+                "sha256:06330f386d6e4b195fbfc736b297f58c5a892e4440e54d294d7004e3a9bbea1b",
+                "sha256:cc44da8e1145637334317feebd728bd869a35285b93cbb4cca2577da7e62db4f"
+            ],
+            "version": "==1.22"
+        }
+    },
+    "develop": {}
+}

+ 145 - 0
scrapefeats.py

@@ -0,0 +1,145 @@
+#!/usr/bin/env python3
+
+from lxml import html
+import requests
+import re
+import time
+import pprint
+import pickle
+
+
+def scrape_featlist(url):
+    """Fetch a feat listing page and return the link target of every feat.
+
+    Args:
+        url: Absolute URL of the feat listing page.
+
+    Returns:
+        A list of ``href`` strings, one per anchor found in the first column
+        of the ``ctl00_MainContent_GridView6`` table, in document order.
+        The values are returned exactly as written in the page.
+
+    Raises:
+        requests.RequestException: If the page cannot be fetched, the
+            request times out, or the server answers with an error status.
+    """
+    # The listing is always read from ``url``; no local snapshot is involved.
+    r = requests.get(url, timeout=30)
+    r.raise_for_status()
+    featspage = html.fromstring(r.content)
+
+    # Selecting @href directly skips anchors that carry no href attribute
+    # instead of failing on them.
+    hrefs = featspage.xpath(
+        '//table[@id="ctl00_MainContent_GridView6"]/tr/td[1]/a/@href')
+
+    return [str(href) for href in hrefs]
+
+
+def sanitext(text):
+    """Clean one fragment of scraped feat text.
+
+    The following substitutions are applied in order:
+
+    1. a leading ``:`` (optionally preceded by whitespace) and the
+       whitespace after it are dropped;
+    2. a trailing ``.`` (optionally followed by whitespace) is dropped;
+    3. en dashes and right single quotes become ASCII ``-`` and ``'``,
+       both when decoded correctly and when their UTF-8 bytes were read
+       as Latin-1 (``â\\x80\\x93`` / ``â\\x80\\x99``);
+    4. the literal markers ACG, APG, ARG, ISWG, OA, UC and UI are removed
+       wherever they occur (presumably source-book abbreviations);
+    5. the "(see the Pathfinder RPG Advanced Player's Guide)" aside is
+       removed.
+
+    Args:
+        text: Raw text taken from the page.
+
+    Returns:
+        The cleaned text with surrounding whitespace stripped.
+    """
+    substitutions = (
+        # Tolerate whitespace around the separator and the final period so
+        # that ' : foo' and 'foo. ' are cleaned like ': foo' and 'foo.'.
+        (r'^\s*:\s+', ''),
+        (r'\.\s*$', ''),
+        (r'â\x80\x93|\u2013', '-'),
+        (r'â\x80\x99|\u2019', "'"),
+        (r'ACG', ''),
+        (r'APG', ''),
+        (r'ARG', ''),
+        (r'ISWG', ''),
+        (r'OA', ''),
+        (r'UC', ''),
+        (r'UI', ''),
+        # Must run after the apostrophe normalization above so the aside is
+        # recognized even when its apostrophe arrived mis-decoded.
+        (r'\(see the Pathfinder RPG Advanced Player\'s Guide\)', ''),
+    )
+
+    rettext = text
+    for pattern, replacement in substitutions:
+        rettext = re.sub(pattern, replacement, rettext)
+
+    return rettext.strip()
+
+
+def addreq(reqset, t, req):
+    """Record requirement ``req`` under key ``t`` in ``reqset``.
+
+    The first value stored for a key is kept as-is. When a second value
+    arrives for the same key the entry becomes a list holding both, and any
+    further values are appended to that list.
+
+    Args:
+        reqset: Dict of requirements; it is modified in place.
+        t: Key to file the requirement under.
+        req: The requirement value to store.
+
+    Returns:
+        The same ``reqset`` object that was passed in.
+    """
+    if t not in reqset:
+        reqset[t] = req
+        return reqset
+
+    existing = reqset[t]
+    if isinstance(existing, list):
+        existing.append(req)
+    else:
+        # Promote the single stored value to a list before adding to it.
+        reqset[t] = [existing, req]
+
+    return reqset
+
+
+def parse_prereqs(reqtext, sep=r'[,;]'):
+    """Parse a prerequisite line into a dict of categorized requirements.
+
+    ``reqtext`` is split on ``sep``; every fragment is cleaned with
+    ``sanitext``, lower-cased and filed with ``addreq`` under the first
+    category that matches:
+
+    * ``'or'`` / ``'and'``: the fragment contains ``' or '`` / ``' and '``;
+      the value is the dict obtained by parsing the fragment again with that
+      word as the separator;
+    * a three-character word followed by a two-digit number (e.g.
+      ``'str 13'``): keyed by the word, value is the number;
+    * ``'bab'``: ``'base attack bonus +N'``, value is ``N``;
+    * ``'level'``: ``'<name> level N'``, value is ``{name: N}``;
+    * ``'skill'``: ``'<name> N rank...'``, value is ``{name: N}``;
+    * ``'feat'``: anything else, value is the fragment itself.
+
+    Numbers are kept as strings. Fragments that are empty after cleaning
+    (for example after a trailing separator) are ignored.
+
+    Args:
+        reqtext: The raw prerequisite text.
+        sep: Regular expression used to split ``reqtext`` into fragments.
+
+    Returns:
+        A dict mapping category to a single value, or to a list of values
+        when a category occurs more than once.
+    """
+    reqs = {}
+    for req in (sanitext(x).lower() for x in re.split(sep, reqtext)):
+        if not req:
+            # Nothing left after cleaning; do not record an empty feat name.
+            continue
+
+        if ' or ' in req:
+            reqs = addreq(reqs, 'or', parse_prereqs(req, sep=' or '))
+            continue
+
+        if ' and ' in req:
+            reqs = addreq(reqs, 'and', parse_prereqs(req, sep=' and '))
+            continue
+
+        m = re.search(r'^(\w{3}) (\d{2})$', req)
+        if m:
+            reqs = addreq(reqs, m.group(1), m.group(2))
+            continue
+
+        m = re.search(r'^base attack bonus \+(\d+)$', req)
+        if m:
+            reqs = addreq(reqs, 'bab', m.group(1))
+            continue
+
+        m = re.search(r'^(.*?) level (\d+)', req)
+        if m:
+            reqs = addreq(reqs, 'level', {m.group(1): m.group(2)})
+            continue
+
+        m = re.search(r'^(.*?) (\d+) rank', req)
+        if m:
+            reqs = addreq(reqs, 'skill', {m.group(1): m.group(2)})
+            continue
+
+        # No structured pattern matched: treat the fragment as a feat name.
+        reqs = addreq(reqs, 'feat', req)
+
+    return reqs
+
+
+
+def scrape_feat(url):
+    """Download one feat page and extract its fields.
+
+    The text nodes of the ``ctl00_MainContent_DataListTypes_ctl00_LabelName``
+    span are walked in order. The first node supplies the name (with any
+    parenthesized part removed); the node following a ``Benefit``,
+    ``Prerequisites`` or ``Special`` heading supplies that field.
+
+    Args:
+        url: Absolute URL of the feat page.
+
+    Returns:
+        A dict with the keys ``'name'``, ``'prereqs'``, ``'benefit'``,
+        ``'special'`` and ``'trick'``, or ``None`` when the span is missing,
+        when the first text node contains ``'Teamwork'``, or when no benefit
+        text was found.
+
+    Raises:
+        requests.RequestException: If the request fails or times out.
+    """
+    # A timeout keeps one unresponsive request from stalling the whole run.
+    r = requests.get(url, timeout=30)
+    featpage = html.fromstring(r.content)
+
+    feat = {'prereqs': {},
+            'benefit': '',
+            'special': '',
+            'trick': '',
+    }
+
+    spans = featpage.xpath('//span[@id="ctl00_MainContent_DataListTypes_ctl00_LabelName"]')
+    if not spans:
+        # Not a feat page (or the layout changed): nothing to extract.
+        return None
+    feattexts = list(spans[0].itertext())
+
+    # The last node is never treated as a heading because every heading
+    # needs at least one node after it.
+    for i, t in enumerate(feattexts[:-1]):
+        if i == 0:
+            if 'Teamwork' in t:
+                break
+            feat['name'] = sanitext(re.sub(r'\(.*?\)', '', t))
+        elif t == 'Benefit' and feat['benefit'] == '':
+            feat['benefit'] = sanitext(feattexts[i+1])
+        elif t == 'Prerequisites' and feat['benefit'] == '':
+            # Prerequisites are only read while no benefit has been captured.
+            feat['prereqs'] = parse_prereqs(feattexts[i+1])
+        elif t == 'Special' and feat['special'] == '':
+            feat['special'] = sanitext(feattexts[i+1])
+        elif t == 'Combat Trick' and i + 7 < len(feattexts):
+            # The trick text is taken seven nodes after its heading; skip it
+            # when the page ends before that.
+            # NOTE(review): the offset of 7 presumably reflects the page
+            # layout at the time of writing - confirm against a live page.
+            feat['trick'] = sanitext(feattexts[i+7])
+
+    return feat if feat['benefit'] else None
+
+
+def scrape_feats(base, url):
+    """Scrape every feat linked from a listing page and pickle the results.
+
+    Each link returned by ``scrape_featlist`` is printed, joined onto
+    ``base`` and passed to ``scrape_feat``; pages for which that returns a
+    falsy value are left out. The collected list is written to
+    ``feats.pickle`` in the current working directory once all pages have
+    been visited.
+
+    Args:
+        base: Prefix prepended to ``url`` and to every scraped link.
+        url: Location of the listing page, relative to ``base``.
+    """
+    collected = []
+    for link in scrape_featlist(base + url):
+        print(link)
+        parsed = scrape_feat(base + link)
+        if parsed:
+            collected.append(parsed)
+        # Pause briefly between page requests.
+        time.sleep(.3)
+
+    with open('feats.pickle', 'wb') as out:
+        pickle.dump(collected, out)
+
+
+
+
+# Script entry point: scrape the feats listed on the Combat category page and
+# store them via scrape_feats (which writes feats.pickle).
+if __name__ == '__main__':
+    scrape_feats('http://www.archivesofnethys.com/', 'Feats.aspx?Category=Combat')
+
+
+
+# /html/body/div/form/div[3]/div[2]/div[2]/div/table/tbody/tr[2]/td[1]
+# #ctl00_MainContent_GridView6 > tbody:nth-child(1) > tr:nth-child(2) > td:nth-child(1)
+# html body.dark div#wrapper.clearfix form#aspnetForm div#page.page.clearfix div#main-wrapper.main-wrapper div#main.main div table#ctl00_MainContent_GridView6 tbody tr td
+
+# /html/body/div[2]/div
+# body > div:nth-child(2) > div:nth-child(1)