# scrapefeats.py
#!/usr/bin/env python3
# Scraper for Pathfinder feat data from archivesofnethys.com; results are
# pickled to disk by scrape_feats().
from lxml import html
import requests
import re
import time
import pprint  # NOTE(review): imported but apparently unused in this file — confirm before removing
import pickle
  8. def scrape_featlist(url):
  9. #r = requests.get(url)
  10. #featspage = html.fromstring(r.content)
  11. featspage = html.parse('/home/jmelesky/code/featscraper/assets/feats.html')
  12. urls = []
  13. for featanchor in featspage.xpath('//table[@id="ctl00_MainContent_GridView6"]/tr/td[1]/a'):
  14. urls.append(featanchor.attrib['href'])
  15. return urls
  16. def sanitext(text):
  17. rettext = re.sub(r'^:\s+', '', text)
  18. rettext = re.sub(r'\.$', '', rettext)
  19. rettext = re.sub(r'\r', '', rettext)
  20. rettext = re.sub(r'â\x80\x93', '-', rettext)
  21. rettext = re.sub(r'â\x80\x99', "'", rettext)
  22. rettext = re.sub(r'—', '\textemdash', rettext)
  23. rettext = re.sub(r'"', "''", rettext)
  24. rettext = re.sub(r'ACG', '', rettext)
  25. rettext = re.sub(r'APG', '', rettext)
  26. rettext = re.sub(r'ARG', '', rettext)
  27. rettext = re.sub(r'ISWG', '', rettext)
  28. rettext = re.sub(r'OA', '', rettext)
  29. rettext = re.sub(r'UC', '', rettext)
  30. rettext = re.sub(r'UI', '', rettext)
  31. rettext = re.sub(r'\(see the Pathfinder RPG Advanced Player\'s Guide\)', '', rettext)
  32. return rettext.strip()
  33. def addreq(reqset, t, req):
  34. if t in reqset:
  35. if not isinstance(reqset[t], list):
  36. reqset[t] = [reqset[t]]
  37. reqset[t].append(req)
  38. else:
  39. reqset[t] = [req]
  40. return reqset
  41. def parse_prereqs(reqtext, sep=r'[,;]'):
  42. reqs = {}
  43. for req in [sanitext(x).lower() for x in re.split(sep, reqtext)]:
  44. if req.find(' or ') > -1:
  45. reqs = addreq(reqs, 'or', parse_prereqs(req, sep=r'\Wor\W'))
  46. elif req.find(' and ') > -1:
  47. reqs = addreq(reqs, 'and', parse_prereqs(req, sep=r'\Wand\W'))
  48. else:
  49. m = re.search(r'^(\w{3}) (\d{1,2})\.?$', req)
  50. if m:
  51. reqs = addreq(reqs, m.group(1), m.group(2))
  52. else:
  53. m = re.search(r'^dexterity (\d+)', req)
  54. if m:
  55. reqs = addreq(reqs, 'dex', m.group(1))
  56. else:
  57. m = re.search(r'^strength (\d+)', req)
  58. if m:
  59. reqs = addreq(reqs, 'str', m.group(1))
  60. else:
  61. m = re.search(r'^base\W+attack\W+bonus\W+\+{0,1}(\d+)', req)
  62. if m:
  63. reqs = addreq(reqs, 'bab', m.group(1))
  64. else:
  65. m = re.search(r'^(.*?)\Wlevel (\d+)', req)
  66. if m:
  67. reqs = addreq(reqs, 'level', (m.group(1), m.group(2)))
  68. else:
  69. m = re.search(r'^(\d+).*level (.*)$', req)
  70. if m:
  71. reqs = addreq(reqs, 'level', (m.group(2), m.group(1)))
  72. else:
  73. m = re.search(r'^(.*?) (\d+) rank', req)
  74. if m:
  75. reqs = addreq(reqs, 'skill', (m.group(1), m.group(2)))
  76. else:
  77. reqs = addreq(reqs, 'feat', req)
  78. return reqs
def scrape_feat(url):
    """Fetch one feat page and extract its fields into a dict.

    Returns a dict with 'name', 'prereqs', 'benefit', 'special' and
    'trick' keys, or None when no Benefit text was found.  Teamwork feats
    are skipped via the early break, which leaves 'benefit' empty and
    therefore also yields None.
    """
    r = requests.get(url)
    featpage = html.fromstring(r.content)
    #featpage = html.parse('/home/jmelesky/code/featscraper/assets/feat-ald.html')
    feat = {'prereqs': {},
            'benefit': '',
            'special': '',
            'trick': '',
            }
    # Flatten all text nodes of the feat span: heading text at index i is
    # followed by its value text at i+1.
    feattexts = [x for x in
                 featpage.xpath('//span[@id="ctl00_MainContent_DataListTypes_ctl00_LabelName"]')[0].itertext()]
    # Stop one short of the end because every heading reads feattexts[i+1].
    for i in range(0,len(feattexts)-1):
        t = feattexts[i]
        if i == 0:
            if t.find('Teamwork') < 0:
                # First node is the feat name; drop any '(...)' qualifier.
                feat['name'] = sanitext(re.sub(r'\(.*?\)', '', t))
            else:
                # Teamwork feats are deliberately not collected.
                break
        elif t == 'Benefit' and feat['benefit'] == '':
            feat['benefit'] = sanitext(feattexts[i+1])
        elif t == 'Prerequisites' and feat['benefit'] == '':
            # NOTE(review): this guards on feat['benefit'] rather than
            # feat['prereqs'] — prerequisites are only parsed while the
            # benefit is still empty (i.e. before the Benefit heading).
            # Possibly a copy-paste of the condition above; confirm
            # against the page layout before changing.
            feat['prereqs'] = parse_prereqs(feattexts[i+1])
        elif t == 'Special' and feat['special'] == '':
            feat['special'] = sanitext(feattexts[i+1])
        elif t == 'Combat Trick':
            # The trick text sits 7 nodes after the heading — presumably
            # fixed intervening markup on these pages; TODO confirm offset.
            feat['trick'] = sanitext(feattexts[i+7])
    # A feat without benefit text is treated as a failed parse.
    if feat['benefit']:
        return feat
    else:
        return None
  109. def scrape_feats(base, url):
  110. urls = scrape_featlist(base + url)
  111. feats = []
  112. for url in urls:
  113. print(url)
  114. feat = scrape_feat(base + url)
  115. if feat:
  116. feats.append(feat)
  117. time.sleep(.3)
  118. with open('feats.pickle', 'wb') as f:
  119. pickle.dump(feats, f)
if __name__ == '__main__':
    # Scrape the Combat feat category from the Archives of Nethys.
    scrape_feats('http://www.archivesofnethys.com/', 'Feats.aspx?Category=Combat')
# XPath / CSS selector scratch notes for locating the feats grid:
# /html/body/div/form/div[3]/div[2]/div[2]/div/table/tbody/tr[2]/td[1]
# #ctl00_MainContent_GridView6 > tbody:nth-child(1) > tr:nth-child(2) > td:nth-child(1)
# html body.dark div#wrapper.clearfix form#aspnetForm div#page.page.clearfix div#main-wrapper.main-wrapper div#main.main div table#ctl00_MainContent_GridView6 tbody tr td
# /html/body/div[2]/div
# body > div:nth-child(2) > div:nth-child(1)