|
@@ -9,13 +9,37 @@ import pickle
|
|
|
|
|
|
|
|
|
def scrape_featlist(url):
    """Fetch a feat-list page and return the feat page URLs linked from it.

    Two page layouts are handled: pages that list feats in tables (the
    anchor in the first cell of each row) and pages that list feats as
    an ``ogn-childpages`` unordered list.  Table links are returned
    first, then child-page links, each group in document order.

    Args:
        url: Address of the feat-list page.  It is also the base that
            relative links are resolved against.

    Returns:
        A list of absolute URL strings.  Links ending in ``teamwork`` or
        ``teamwork/`` are left out, as are absolute links that do not
        contain ``feats``.
    """
    # A timeout keeps one stalled connection from hanging the whole run.
    r = requests.get(url, timeout=30)
    featspage = html.fromstring(r.content)

    anchor_paths = (
        # for pages with tables
        '//table/tbody/tr/td[1]/a',
        # for pages with subpages list
        "//ul[@class='ogn-childpages']/li/a",
    )

    urls = []
    for anchor_path in anchor_paths:
        for featanchor in featspage.xpath(anchor_path):
            # .get() so an anchor without an href is skipped, not a KeyError.
            featurl = _resolve_feat_link(url, featanchor.attrib.get('href'))
            if featurl is not None:
                urls.append(featurl)

    return urls


def _resolve_feat_link(base, href):
    """Return the absolute URL for *href*, or None if the link is skipped."""
    from urllib.parse import urljoin

    if not href or href.endswith(('teamwork', 'teamwork/')):
        return None

    if href.startswith(('http:', 'https:')):
        # Absolute links are only kept when they point into a feats section.
        return href if 'feats' in href else None

    # Treat the list page as a directory so a plain relative link still
    # resolves to base + '/' + href, while root-relative links
    # ('/feats/x') resolve against the host instead of being glued on.
    return urljoin(base.rstrip('/') + '/', href)
|
|
|
|
|
@@ -130,25 +154,29 @@ def scrape_feat(url):
|
|
|
return None
|
|
|
|
|
|
|
|
|
def scrape_feats(baseurls, outfile=None, delay=0):
    """Scrape every feat linked from the given feat-list pages.

    Each entry of *baseurls* is passed to ``scrape_featlist``; every URL
    that comes back is printed and then passed to ``scrape_feat``.

    Args:
        baseurls: Iterable of feat-list page URLs.
        outfile: Optional path.  When given, the collected feats are
            pickled to it.  The default (None) writes nothing.
        delay: Seconds to sleep after each feat page is scraped.  The
            default (0) does not sleep.

    Returns:
        A list of the truthy results of ``scrape_feat``, in the order the
        URLs were discovered.
    """
    urls = []
    for baseurl in baseurls:
        urls.extend(scrape_featlist(baseurl))

    feats = []
    for url in urls:
        print(url)
        feat = scrape_feat(url)
        # scrape_feat can return None; keep only pages that yielded a feat.
        if feat:
            feats.append(feat)
        if delay:
            time.sleep(delay)

    if outfile is not None:
        with open(outfile, 'wb') as f:
            pickle.dump(feats, f)

    return feats
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == '__main__':
    # Feat-list pages to crawl when the module is run as a script.
    start_pages = [
        'http://www.d20pfsrd.com/feats/combat-feats',
        'http://www.d20pfsrd.com/feats/armor-mastery-feats',
        'http://www.d20pfsrd.com/feats/weapon-mastery-feats',
    ]
    scrape_feats(start_pages)
|
|
|
|
|
|
|
|
|
|