|
@@ -4,20 +4,33 @@ extern crate select;
|
|
|
use select::document::Document;
|
|
|
use select::predicate::Name;
|
|
|
|
|
|
-
|
|
|
fn main() {
|
|
|
scrape_feats("http://www.archivesofnethys.com/Feats.aspx?Category=Combat");
|
|
|
}
|
|
|
|
|
|
+
|
|
|
fn scrape_feats(url: &str) {
|
|
|
+ // let bookstrs = [
|
|
|
+ // "ACG",
|
|
|
+ // "APG",
|
|
|
+ // "ARG",
|
|
|
+ // "ISWG",
|
|
|
+ // "OA",
|
|
|
+ // "UC",
|
|
|
+ // "UI",
|
|
|
+ // "(see the Pathfinder RPG Advanced Player's Guide)"
|
|
|
+ // ];
|
|
|
|
|
|
// commenting out -- going local for a bit
|
|
|
- //let resp = reqwest::get(url).unwrap();
|
|
|
- //assert!(resp.status().is_success());
|
|
|
+ let resp = reqwest::get(url).unwrap();
|
|
|
+ assert!(resp.status().is_success());
|
|
|
|
|
|
- //let doc = Document::from_read(resp).unwrap();
|
|
|
+ let doc = Document::from_read(resp).unwrap();
|
|
|
|
|
|
- let doc = Document::from(include_str!("/home/jmelesky/code/featscraper/assets/feats.html"));
|
|
|
+ //let doc = Document::from(include_str!("/home/jmelesky/code/featscraper/assets/feats.html"));
|
|
|
+
|
|
|
+ // compile this regex once, not once per node
|
|
|
+ let re_uppers = Regex::new(r"[A-Z]{2}").unwrap();
|
|
|
|
|
|
let mut header = true;
|
|
|
for node in doc.find(Name("tr")) {
|
|
@@ -26,17 +39,28 @@ fn scrape_feats(url: &str) {
|
|
|
header = false;
|
|
|
} else {
|
|
|
let mut tds = node.find(Name("td"));
|
|
|
- let name_raw = tds.next().unwrap().text();
|
|
|
- let prereqs_raw = tds.next().unwrap().text();
|
|
|
- let benefit_raw = tds.next().unwrap().text();
|
|
|
|
|
|
- // extra variables because '....unwrap().text().trim()' complains
|
|
|
+ let link = tds.next().unwrap() // link table cell
|
|
|
+ .find(Name("a")) // list of all links (just one)
|
|
|
+ .next().unwrap(); // the first link
|
|
|
+ let mut prereqs = tds.next().unwrap().text();
|
|
|
+
|
|
|
+ // parse out the link and name
|
|
|
+ let featurl = link.attr("href").unwrap();
|
|
|
+ let mut featname = link.text();
|
|
|
+
|
|
|
+
|
|
|
+ // extra code because '....unwrap().text().trim()' complains
|
|
|
// about lifetime of borrowed value
|
|
|
- let name = name_raw.trim();
|
|
|
- let prereqs = prereqs_raw.trim();
|
|
|
- let benefit = benefit_raw.trim();
|
|
|
+ let featname = featname.trim();
|
|
|
+ let prereqs = prereqs.trim();
|
|
|
+
|
|
|
|
|
|
- println!("|{}|{}|{}|", name, prereqs, benefit);
|
|
|
+ // print only the ones with two consecutive uppercase letters
|
|
|
+ // in the prereqs -- figure out what to trim
|
|
|
+ if re_uppers.is_match(prereqs) {
|
|
|
+ println!("|{}|{}|{}|", featname, prereqs, featurl);
|
|
|
+ }
|
|
|
}
|
|
|
}
|
|
|
|