7 years ago · b1917b5569
--- a/queryfeats.py
+++ b/queryfeats.py
@@ -12,11 +12,11 @@ charsheet = {
 
				     'str': 20,
			
 
				     'dex': 14,
			
 
				     'con': 14,
			
 
				-    'int': 14,
			
 
				+    'int': 16,
			
 
				     'wis': 10,
			
 
				     'cha': 10,
			
 
				     'level': [
			
 
				-        ('fighter', 9)
			
 
				+        ('fighter', 10)
			
 
				     ],
			
 
				     'feat': [
			
 
				         'toughness',
			
--- a/scrapefeats.py
+++ b/scrapefeats.py
@@ -9,13 +9,37 @@ import pickle
 
				 
			
 
				 
			
 
				 def scrape_featlist(url):
			
 
				-    #r = requests.get(url)
			
 
				-    #featspage = html.fromstring(r.content)
			
 
				-    featspage = html.parse('/home/jmelesky/code/featscraper/assets/feats.html')
			
 
				+    r = requests.get(url)
			
 
				+    featspage = html.fromstring(r.content)
			
 
				+    #featspage = html.parse('/home/jmelesky/code/featscraper/assets/feats.html')
			
 
				 
			
 
				     urls = []
			
 
				-    for featanchor in featspage.xpath('//table[@id="ctl00_MainContent_GridView6"]/tr/td[1]/a'):
			
 
				-        urls.append(featanchor.attrib['href'])
			
 
				+
			
 
				+    # for pages with tables
			
 
				+    for featanchor in featspage.xpath('//table/tbody/tr/td[1]/a'):
			
 
				+        featurl = featanchor.attrib['href']
			
 
				+        if (featurl.startswith('http:') and 'feats' in featurl
			
 
				+            and not featurl.endswith('teamwork')
			
 
				+            and not featurl.endswith('teamwork/')):
			
 
				+            urls.append(featurl)
			
 
				+        elif (not featurl.startswith('http:')
			
 
				+              and not featurl.endswith('teamwork')
			
 
				+              and not featurl.endswith('teamwork/')):
			
 
				+            urls.append(url + '/' + featurl)
			
 
				+
			
 
				+
			
 
				+    # for pages with subpages list
			
 
				+    for featanchor in featspage.xpath("//ul[@class='ogn-childpages']/li/a"):
			
 
				+        featurl = featanchor.attrib['href']
			
 
				+        if (featurl.startswith('http:') and 'feats' in featurl
			
 
				+            and not featurl.endswith('teamwork')
			
 
				+            and not featurl.endswith('teamwork/')):
			
 
				+            urls.append(featurl)
			
 
				+        elif (not featurl.startswith('http:')
			
 
				+              and not featurl.endswith('teamwork')
			
 
				+              and not featurl.endswith('teamwork/')):
			
 
				+            urls.append(url + '/' + featurl)
			
 
				+
			
 
				 
			
 
				     return urls
			
 
				 
			
@@ -130,25 +154,29 @@ def scrape_feat(url):
 
				         return None
			
 
				 
			
 
				 
			
 
				-def scrape_feats(base, url):
			
 
				-    urls = scrape_featlist(base + url)
			
 
				+def scrape_feats(baseurls):
			
 
				+    urls = []
			
 
				+    for baseurl in baseurls:
			
 
				+        urls += scrape_featlist(baseurl)
			
 
				 
			
 
				     feats = []
			
 
				     for url in urls:
			
 
				         print(url)
			
 
				-        feat = scrape_feat(base + url)
			
 
				-        if feat:
			
 
				-            feats.append(feat)
			
 
				-        time.sleep(.3)
			
 
				+         feat = scrape_feat(url)
			
 
				+    #     if feat:
			
 
				+    #         feats.append(feat)
			
 
				+    #     time.sleep(.3)
			
 
				 
			
 
				-    with open('feats.pickle', 'wb') as f:
			
 
				-        pickle.dump(feats, f)
			
 
				+    # with open('feats.pickle', 'wb') as f:
			
 
				+    #     pickle.dump(feats, f)
			
 
				 
			
 
				 
			
 
				 
			
 
				 
			
 
				 if __name__ == '__main__':
			
 
				-    scrape_feats('http://www.archivesofnethys.com/', 'Feats.aspx?Category=Combat')
			
 
				+    scrape_feats(['http://www.d20pfsrd.com/feats/combat-feats',
			
 
				+                  'http://www.d20pfsrd.com/feats/armor-mastery-feats',
			
 
				+                  'http://www.d20pfsrd.com/feats/weapon-mastery-feats'])