#!/usr/bin/env python3

from lxml import html
import requests
import re
import time
import pprint
import pickle


def scrape_featlist(url):
    """Return the individual feat-page URLs linked from a feat index page."""
    r = requests.get(url)
    featspage = html.fromstring(r.content)
    # featspage = html.parse('/home/jmelesky/code/featscraper/assets/feats.html')
    urls = []

    # for pages with subpages list
    for featanchor in featspage.xpath("//ul[@class='ogn-childpages']/li/a"):
        featurl = featanchor.attrib['href']
        # absolute links: keep only feat pages, skipping teamwork feats
        if (featurl.startswith('http:') and 'feats' in featurl
                and not featurl.endswith('teamwork')
                and not featurl.endswith('teamwork/')):
            urls.append(featurl)
        # relative links: resolve against the index page URL
        elif (not featurl.startswith('http:')
              and not featurl.endswith('teamwork')
              and not featurl.endswith('teamwork/')):
            urls.append(url + '/' + featurl)

    return urls


def sanitext(text):
    """Clean scraped text: strip labels, fix punctuation and encoding for LaTeX output."""
    rettext = re.sub(r'^:\s+', '', text)
    rettext = re.sub(r'\.$', '', rettext)
    rettext = re.sub(r'\r', '', rettext)
    # repair common mojibake (UTF-8 punctuation decoded as Latin-1)
    rettext = re.sub(r'â\x80\x93', '-', rettext)
    rettext = re.sub(r'â\x80\x99', "'", rettext)
    # normalize punctuation and escape characters that LaTeX treats specially
    rettext = re.sub(r'—', r'\\textemdash', rettext)
    rettext = re.sub(r'"', "''", rettext)
    rettext = re.sub(r'’', "'", rettext)
    rettext = re.sub(r'%', r'\%', rettext)
    rettext = re.sub(r'—', ' -- ', rettext)  # redundant: em dashes already replaced above
    rettext = re.sub(r'–', '-', rettext)
    # drop sourcebook abbreviations appended to feat names
    rettext = re.sub(r'ACG', '', rettext)
    rettext = re.sub(r'APG', '', rettext)
    rettext = re.sub(r'ARG', '', rettext)
    rettext = re.sub(r'ISWG', '', rettext)
    rettext = re.sub(r'OA', '', rettext)
    rettext = re.sub(r'UC', '', rettext)
    rettext = re.sub(r'UI', '', rettext)
    rettext = re.sub(r'\(see the Pathfinder RPG Advanced Player\'s Guide\)', '', rettext)
    return rettext.strip()
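
# Illustrative only (assumed input, not from the original source):
#   sanitext(': Dodge, Mobility.')  ->  'Dodge, Mobility'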


def addreq(reqset, t, req):
    """Add requirement `req` under key `t`, collecting repeated keys into a list."""
    if t in reqset:
        if not isinstance(reqset[t], list):
            reqset[t] = [reqset[t]]
        reqset[t].append(req)
    else:
        reqset[t] = [req]
    return reqset
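
# Illustrative only (assumed input, not from the original source):
#   addreq({}, 'feat', 'dodge')                      -> {'feat': ['dodge']}
#   addreq({'feat': ['dodge']}, 'feat', 'mobility')  -> {'feat': ['dodge', 'mobility']}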


def chomp(text):
    # drop the leading label (e.g. "Benefit:") by splitting on the first space
    return text.split(' ', 1)[1]


def parse_prereqs(reqtext, sep=r'[,;]'):
    """Parse a prerequisites string into a dict keyed by requirement type."""
    reqs = {}
    for req in [sanitext(x).lower() for x in re.split(sep, reqtext)]:
        if req.find(' or ') > -1:
            reqs = addreq(reqs, 'or', parse_prereqs(req, sep=r'\Wor\W'))
        elif req.find(' and ') > -1:
            reqs = addreq(reqs, 'and', parse_prereqs(req, sep=r'\Wand\W'))
        else:
            # three-letter abbreviation plus number (ability scores like "dex 13")
            m = re.search(r'^(\w{3}) (\d{1,2})\.?$', req)
            if m:
                reqs = addreq(reqs, m.group(1), m.group(2))
            else:
                m = re.search(r'^dexterity (\d+)', req)
                if m:
                    reqs = addreq(reqs, 'dex', m.group(1))
                else:
                    m = re.search(r'^strength (\d+)', req)
                    if m:
                        reqs = addreq(reqs, 'str', m.group(1))
                    else:
                        # base attack bonus, e.g. "base attack bonus +6"
                        m = re.search(r'^base\W+attack\W+bonus\W+\+{0,1}(\d+)', req)
                        if m:
                            reqs = addreq(reqs, 'bab', m.group(1))
                        else:
                            # class or character level, e.g. "fighter level 8"
                            m = re.search(r'^(.*?)\Wlevel (\d+)', req)
                            if m:
                                reqs = addreq(reqs, 'level', (m.group(1), m.group(2)))
                            else:
                                # level written first, e.g. "8th level fighter"
                                m = re.search(r'^(\d+).*level (.*)$', req)
                                if m:
                                    reqs = addreq(reqs, 'level', (m.group(2), m.group(1)))
                                else:
                                    # skill ranks, e.g. "acrobatics 3 ranks"
                                    m = re.search(r'^(.*?) (\d+) rank', req)
                                    if m:
                                        reqs = addreq(reqs, 'skill', (m.group(1), m.group(2)))
                                    else:
                                        # anything else is assumed to be a feat name
                                        reqs = addreq(reqs, 'feat', req)
    return reqs
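
# Illustrative only (assumed input, not from the original source):
#   parse_prereqs('Dodge, Dex 13, base attack bonus +4')
#   -> {'feat': ['dodge'], 'dex': ['13'], 'bab': ['4']}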


def scrape_feat(url):
    """Scrape one feat page into a dict, or return None for teamwork feats or empty pages."""
    r = requests.get(url)
    featpage = html.fromstring(r.content)
    # featpage = html.parse('/home/jmelesky/code/featscraper/assets/feat-ald.html')
    feat = {'prereqs': {},
            'benefit': '',
            'special': '',
            'trick': '',
            }

    # the page title is the feat name; skip teamwork feats entirely
    nametext = featpage.xpath('//h1')[0].text_content()
    if nametext.find('Teamwork') < 0:
        feat['name'] = sanitext(re.sub(r'\(.*?\)', '', nametext))
    else:
        return None

    feattexts = [x for x in
                 featpage.xpath('//div[@class="article-content"]/p')]

    for i in feattexts:
        t = i.text_content()
        if t.startswith('Benefit') and feat['benefit'] == '':
            feat['benefit'] = sanitext(chomp(t))
        elif t.startswith('Prerequisite') and feat['benefit'] == '':
            feat['prereqs'] = parse_prereqs(chomp(t))
        elif t.startswith('Special') and feat['special'] == '':
            feat['special'] = sanitext(chomp(t))
        elif t.startswith('Combat Trick'):
            feat['trick'] = sanitext(chomp(t))
        else:
            # continuation paragraphs get appended to the benefit text
            if feat['benefit'] != '' and not t.startswith('Normal'):
                feat['benefit'] += '\n\n'
                feat['benefit'] += sanitext(t)
            else:
                print('>>> ' + t)

    # bulleted list items in the article are appended to the benefit as '- ' lines
    extralist = [x for x in
                 featpage.xpath('//div[@class="article-content"]/ul/li')]

    for extra in extralist:
        feat['benefit'] += '\n\n'
        feat['benefit'] += '- ' + sanitext(extra.text_content())

    if feat['benefit']:
        return feat
    else:
        return None
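
# Illustrative only (not from the original source): a successful scrape returns a dict
# of the shape
#   {'name': ..., 'prereqs': {...}, 'benefit': ..., 'special': ..., 'trick': ...}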


def scrape_feats(baseurls):
    """Scrape every feat linked from the given index pages and pickle the results."""
    urls = []
    for baseurl in baseurls:
        urls += scrape_featlist(baseurl)

    feats = []
    for url in urls:
        feat = scrape_feat(url)
        if feat:
            feats.append(feat)
        time.sleep(.3)  # pause briefly between requests

    with open('feats.pickle', 'wb') as f:
        pickle.dump(feats, f)


if __name__ == '__main__':
    scrape_feats(['http://www.d20pfsrd.com/feats/combat-feats/all-combat-feats',
                  'http://www.d20pfsrd.com/feats/armor-mastery-feats',
                  'http://www.d20pfsrd.com/feats/weapon-mastery-feats'])
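
# Illustrative usage (assumption, not part of the original file): running
#   python3 scrapefeats.py
# fetches the three feat indexes above and writes the scraped feats to feats.pickle.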

# /html/body/div/form/div[3]/div[2]/div[2]/div/table/tbody/tr[2]/td[1]
# #ctl00_MainContent_GridView6 > tbody:nth-child(1) > tr:nth-child(2) > td:nth-child(1)
# html body.dark div#wrapper.clearfix form#aspnetForm div#page.page.clearfix div#main-wrapper.main-wrapper div#main.main div table#ctl00_MainContent_GridView6 tbody tr td
# /html/body/div[2]/div
# body > div:nth-child(2) > div:nth-child(1)