#!/usr/bin/env python3
# scrapefeats.py
import pickle
import pprint
import re
import time
from urllib.parse import urljoin

import requests
from lxml import html
  8. def scrape_featlist(url):
  9. r = requests.get(url)
  10. featspage = html.fromstring(r.content)
  11. #featspage = html.parse('/home/jmelesky/code/featscraper/assets/feats.html')
  12. urls = []
  13. # for pages with tables
  14. for featanchor in featspage.xpath('//table/tbody/tr/td[1]/a'):
  15. featurl = featanchor.attrib['href']
  16. if (featurl.startswith('http:') and 'feats' in featurl
  17. and not featurl.endswith('teamwork')
  18. and not featurl.endswith('teamwork/')):
  19. urls.append(featurl)
  20. elif (not featurl.startswith('http:')
  21. and not featurl.endswith('teamwork')
  22. and not featurl.endswith('teamwork/')):
  23. urls.append(url + '/' + featurl)
  24. # for pages with subpages list
  25. for featanchor in featspage.xpath("//ul[@class='ogn-childpages']/li/a"):
  26. featurl = featanchor.attrib['href']
  27. if (featurl.startswith('http:') and 'feats' in featurl
  28. and not featurl.endswith('teamwork')
  29. and not featurl.endswith('teamwork/')):
  30. urls.append(featurl)
  31. elif (not featurl.startswith('http:')
  32. and not featurl.endswith('teamwork')
  33. and not featurl.endswith('teamwork/')):
  34. urls.append(url + '/' + featurl)
  35. return urls
  36. def sanitext(text):
  37. rettext = re.sub(r'^:\s+', '', text)
  38. rettext = re.sub(r'\.$', '', rettext)
  39. rettext = re.sub(r'\r', '', rettext)
  40. rettext = re.sub(r'â\x80\x93', '-', rettext)
  41. rettext = re.sub(r'â\x80\x99', "'", rettext)
  42. rettext = re.sub(r'—', '\textemdash', rettext)
  43. rettext = re.sub(r'"', "''", rettext)
  44. rettext = re.sub(r'ACG', '', rettext)
  45. rettext = re.sub(r'APG', '', rettext)
  46. rettext = re.sub(r'ARG', '', rettext)
  47. rettext = re.sub(r'ISWG', '', rettext)
  48. rettext = re.sub(r'OA', '', rettext)
  49. rettext = re.sub(r'UC', '', rettext)
  50. rettext = re.sub(r'UI', '', rettext)
  51. rettext = re.sub(r'\(see the Pathfinder RPG Advanced Player\'s Guide\)', '', rettext)
  52. return rettext.strip()
  53. def addreq(reqset, t, req):
  54. if t in reqset:
  55. if not isinstance(reqset[t], list):
  56. reqset[t] = [reqset[t]]
  57. reqset[t].append(req)
  58. else:
  59. reqset[t] = [req]
  60. return reqset
  61. def parse_prereqs(reqtext, sep=r'[,;]'):
  62. reqs = {}
  63. for req in [sanitext(x).lower() for x in re.split(sep, reqtext)]:
  64. if req.find(' or ') > -1:
  65. reqs = addreq(reqs, 'or', parse_prereqs(req, sep=r'\Wor\W'))
  66. elif req.find(' and ') > -1:
  67. reqs = addreq(reqs, 'and', parse_prereqs(req, sep=r'\Wand\W'))
  68. else:
  69. m = re.search(r'^(\w{3}) (\d{1,2})\.?$', req)
  70. if m:
  71. reqs = addreq(reqs, m.group(1), m.group(2))
  72. else:
  73. m = re.search(r'^dexterity (\d+)', req)
  74. if m:
  75. reqs = addreq(reqs, 'dex', m.group(1))
  76. else:
  77. m = re.search(r'^strength (\d+)', req)
  78. if m:
  79. reqs = addreq(reqs, 'str', m.group(1))
  80. else:
  81. m = re.search(r'^base\W+attack\W+bonus\W+\+{0,1}(\d+)', req)
  82. if m:
  83. reqs = addreq(reqs, 'bab', m.group(1))
  84. else:
  85. m = re.search(r'^(.*?)\Wlevel (\d+)', req)
  86. if m:
  87. reqs = addreq(reqs, 'level', (m.group(1), m.group(2)))
  88. else:
  89. m = re.search(r'^(\d+).*level (.*)$', req)
  90. if m:
  91. reqs = addreq(reqs, 'level', (m.group(2), m.group(1)))
  92. else:
  93. m = re.search(r'^(.*?) (\d+) rank', req)
  94. if m:
  95. reqs = addreq(reqs, 'skill', (m.group(1), m.group(2)))
  96. else:
  97. reqs = addreq(reqs, 'feat', req)
  98. return reqs
  99. def scrape_feat(url):
  100. r = requests.get(url)
  101. featpage = html.fromstring(r.content)
  102. #featpage = html.parse('/home/jmelesky/code/featscraper/assets/feat-ald.html')
  103. feat = {'prereqs': {},
  104. 'benefit': '',
  105. 'special': '',
  106. 'trick': '',
  107. }
  108. feattexts = [x for x in
  109. featpage.xpath('//span[@id="ctl00_MainContent_DataListTypes_ctl00_LabelName"]')[0].itertext()]
  110. for i in range(0,len(feattexts)-1):
  111. t = feattexts[i]
  112. if i == 0:
  113. if t.find('Teamwork') < 0:
  114. feat['name'] = sanitext(re.sub(r'\(.*?\)', '', t))
  115. else:
  116. break
  117. elif t == 'Benefit' and feat['benefit'] == '':
  118. feat['benefit'] = sanitext(feattexts[i+1])
  119. elif t == 'Prerequisites' and feat['benefit'] == '':
  120. feat['prereqs'] = parse_prereqs(feattexts[i+1])
  121. elif t == 'Special' and feat['special'] == '':
  122. feat['special'] = sanitext(feattexts[i+1])
  123. elif t == 'Combat Trick':
  124. feat['trick'] = sanitext(feattexts[i+7])
  125. if feat['benefit']:
  126. return feat
  127. else:
  128. return None
  129. def scrape_feats(baseurls):
  130. urls = []
  131. for baseurl in baseurls:
  132. urls += scrape_featlist(baseurl)
  133. feats = []
  134. for url in urls:
  135. print(url)
  136. feat = scrape_feat(url)
  137. # if feat:
  138. # feats.append(feat)
  139. # time.sleep(.3)
  140. # with open('feats.pickle', 'wb') as f:
  141. # pickle.dump(feats, f)
  142. if __name__ == '__main__':
  143. scrape_feats(['http://www.d20pfsrd.com/feats/combat-feats',
  144. 'http://www.d20pfsrd.com/feats/armor-mastery-feats',
  145. 'http://www.d20pfsrd.com/feats/weapon-mastery-feats'])
# Scratch notes: XPath / CSS selector paths (apparently copied from browser
# developer tools while locating page elements; not used by the code).
# /html/body/div/form/div[3]/div[2]/div[2]/div/table/tbody/tr[2]/td[1]
# #ctl00_MainContent_GridView6 > tbody:nth-child(1) > tr:nth-child(2) > td:nth-child(1)
# html body.dark div#wrapper.clearfix form#aspnetForm div#page.page.clearfix div#main-wrapper.main-wrapper div#main.main div table#ctl00_MainContent_GridView6 tbody tr td
# /html/body/div[2]/div
# body > div:nth-child(2) > div:nth-child(1)