Browse Source

Tried and failed to incorporate removal of the book-reference bits. Now grabbing the feat URL, though.

jmelesky 7 years ago
parent
commit
491418d45b
1 changed files with 37 additions and 13 deletions
  1. 37 13
      src/main.rs

+ 37 - 13
src/main.rs

@@ -4,20 +4,33 @@ extern crate select;
 use select::document::Document;
 use select::predicate::Name;
 
-
 fn main() {
     scrape_feats("http://www.archivesofnethys.com/Feats.aspx?Category=Combat");
 }
 
+
 fn scrape_feats(url: &str) {
+    // let bookstrs = [
+    //     "ACG",
+    //     "APG",
+    //     "ARG",
+    //     "ISWG",
+    //     "OA",
+    //     "UC",
+    //     "UI",
+    //     "(see the Pathfinder RPG Advanced Player's Guide)"
+    // ];
 
     // commenting out -- going local for a bit
-    //let resp = reqwest::get(url).unwrap();
-    //assert!(resp.status().is_success());
+    let resp = reqwest::get(url).unwrap();
+    assert!(resp.status().is_success());
 
-    //let doc = Document::from_read(resp).unwrap();
+    let doc = Document::from_read(resp).unwrap();
 
-    let doc = Document::from(include_str!("/home/jmelesky/code/featscraper/assets/feats.html"));
+    //let doc = Document::from(include_str!("/home/jmelesky/code/featscraper/assets/feats.html"));
+
+    // compile this regex once, not once per node
+    let re_uppers = Regex::new(r"[A-Z]{2}").unwrap();
 
     let mut header = true;
     for node in doc.find(Name("tr")) {
@@ -26,17 +39,28 @@ fn scrape_feats(url: &str) {
             header = false;
         } else {
             let mut tds = node.find(Name("td"));
-            let name_raw = tds.next().unwrap().text();
-            let prereqs_raw = tds.next().unwrap().text();
-            let benefit_raw = tds.next().unwrap().text();
 
-            // extra variables because '....unwrap().text().trim()' complains
+            let link = tds.next().unwrap() // link table cell
+                .find(Name("a")) // list of all links (just one)
+                .next().unwrap(); // the first link
+            let mut prereqs = tds.next().unwrap().text();
+
+            // parse out the link and name
+            let featurl = link.attr("href").unwrap();
+            let mut featname = link.text();
+
+
+            // extra code because '....unwrap().text().trim()' complains
             // about lifetime of borrowed value
-            let name = name_raw.trim();
-            let prereqs = prereqs_raw.trim();
-            let benefit = benefit_raw.trim();
+            let featname = featname.trim();
+            let prereqs = prereqs.trim();
+
 
-            println!("|{}|{}|{}|", name, prereqs, benefit);
+            // print only the ones with two consecutive uppercase letters
+            // in the prereqs -- figure out what to trim
+            if re_uppers.is_match(prereqs) {
+                println!("|{}|{}|{}|", featname, prereqs, featurl);
+            }
         }
     }