
updating for level 10 and d20pfsrd

jmelesky · 6 years ago · commit b1917b5569
2 changed files with 44 additions and 16 deletions
  1. queryfeats.py  (+2, -2)
  2. scrapefeats.py  (+42, -14)

queryfeats.py  (+2, -2)

@@ -12,11 +12,11 @@ charsheet = {
     'str': 20,
     'dex': 14,
     'con': 14,
-    'int': 14,
+    'int': 16,
     'wis': 10,
     'cha': 10,
     'level': [
-        ('fighter', 9)
+        ('fighter', 10)
     ],
     'feat': [
         'toughness',

scrapefeats.py  (+42, -14)

@@ -9,13 +9,37 @@ import pickle
 
 
 def scrape_featlist(url):
-    #r = requests.get(url)
-    #featspage = html.fromstring(r.content)
-    featspage = html.parse('/home/jmelesky/code/featscraper/assets/feats.html')
+    r = requests.get(url)
+    featspage = html.fromstring(r.content)
+    #featspage = html.parse('/home/jmelesky/code/featscraper/assets/feats.html')
 
     urls = []
-    for featanchor in featspage.xpath('//table[@id="ctl00_MainContent_GridView6"]/tr/td[1]/a'):
-        urls.append(featanchor.attrib['href'])
+
+    # for pages with tables
+    for featanchor in featspage.xpath('//table/tbody/tr/td[1]/a'):
+        featurl = featanchor.attrib['href']
+        if (featurl.startswith('http:') and 'feats' in featurl
+            and not featurl.endswith('teamwork')
+            and not featurl.endswith('teamwork/')):
+            urls.append(featurl)
+        elif (not featurl.startswith('http:')
+              and not featurl.endswith('teamwork')
+              and not featurl.endswith('teamwork/')):
+            urls.append(url + '/' + featurl)
+
+
+    # for pages with subpages list
+    for featanchor in featspage.xpath("//ul[@class='ogn-childpages']/li/a"):
+        featurl = featanchor.attrib['href']
+        if (featurl.startswith('http:') and 'feats' in featurl
+            and not featurl.endswith('teamwork')
+            and not featurl.endswith('teamwork/')):
+            urls.append(featurl)
+        elif (not featurl.startswith('http:')
+              and not featurl.endswith('teamwork')
+              and not featurl.endswith('teamwork/')):
+            urls.append(url + '/' + featurl)
+
 
     return urls
 
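The two XPath loops added above (the table rows and the ogn-childpages list) apply identical accept/reject rules to each href. A minimal sketch of pulling that test into one place, assuming the rules stay exactly as written in the hunk; the helper name keep_feat_url is hypothetical, not part of this commit:

def keep_feat_url(featurl):
    # skip the teamwork-feat index pages, as both loops do
    if featurl.endswith('teamwork') or featurl.endswith('teamwork/'):
        return False
    # absolute links are kept only when they mention 'feats';
    # relative links are kept and joined to the page URL by the caller
    if featurl.startswith('http:'):
        return 'feats' in featurl
    return True

Each loop body would then shrink to a keep_feat_url() check plus the existing absolute-versus-relative append.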
@@ -130,25 +154,29 @@ def scrape_feat(url):
         return None
 
 
-def scrape_feats(base, url):
-    urls = scrape_featlist(base + url)
+def scrape_feats(baseurls):
+    urls = []
+    for baseurl in baseurls:
+        urls += scrape_featlist(baseurl)
 
     feats = []
     for url in urls:
         print(url)
-        feat = scrape_feat(base + url)
-        if feat:
-            feats.append(feat)
-        time.sleep(.3)
+        feat = scrape_feat(url)
+    #     if feat:
+    #         feats.append(feat)
+    #     time.sleep(.3)
 
-    with open('feats.pickle', 'wb') as f:
-        pickle.dump(feats, f)
+    # with open('feats.pickle', 'wb') as f:
+    #     pickle.dump(feats, f)
 
 
 
 
 if __name__ == '__main__':
-    scrape_feats('http://www.archivesofnethys.com/', 'Feats.aspx?Category=Combat')
+    scrape_feats(['http://www.d20pfsrd.com/feats/combat-feats',
+                  'http://www.d20pfsrd.com/feats/armor-mastery-feats',
+                  'http://www.d20pfsrd.com/feats/weapon-mastery-feats'])