lwp.html 6.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310
  1. <!DOCTYPE HTML PUBLIC "-//IETF//DTD HTML//EN">
  2. <html> <head>
  3. <title>Write Your Own Bayesian Classifier!</title>
  4. <style>
  5. .slide {
  6. border: 2px solid #006600;
  7. background-color: #CCFFCC;
  8. position: absolute;
  9. padding: 5%;
  10. width: 85%;
  11. height: 80%;
  12. }
  13. </style>
  14. <script src="scripts/jquery-1.2.3.js" type="text/javascript"></script>
  15. <script src="scripts/slideshow.js" type="text/javascript"></script>
  16. </head>
  17. <body>
  18. <div class='slide'>
  19. <h1>Write Your Own Bayesian Classifier!</h1>
  20. <p>(LPW '07) (john melesky)</p>
  21. <p>This is an hour talk, squeezed into 30 minutes, so let's get going.</p>
  22. </div>
  23. <div class='slide fade'>
  24. <h1>Write Your Own Bayesian Classifier!</h1>
  25. <p>(LPW '07) (john melesky)</p>
  26. <p>This is an hour talk, squeezed into 30 minutes, so let's get going.</p>
  27. <p>(oh, i'm available for freelance work in machine learning/natural language processing)</p>
  28. </div>
  29. <div class='slide'>
  30. <h1>Wait, wait</h1>
  31. <em>Why</em> the hell should i <del>reinvent the wheel</del> write my own Bayesian classifier?
  32. </div>
  33. <div class='slide'>
  34. <h1>Opening joke</h1>
  35. <p>Q: How many Londoners does it take to change a lightbulb?</p>
  36. </div>
  37. <div class='slide'>
  38. <h1>Opening joke</h1>
  39. <p>Q: How many Londoners does it take to change a lightbulb?</p>
  40. <p>A: 127</p>
  41. </div>
  42. <div class='slide'>
  43. <h1>Really?</h1>
  44. <p>If this is true, that means that 8.415 million Londoners (99%!) are changing lightbulbs <em>right now</em>.</p>
  45. </div>
  46. <div class='slide'>
  47. <h1>Really?</h1>
  48. <p>If this is true, that means that 8.415 million Londoners (99%!) are changing lightbulbs <em>right now</em>.</p>
  49. <p>By contrast, only 1% of the rest of the world are currently changing lightbulbs.</p>
  50. </div>
  51. <div class='slide'>
  52. <h1>Question</h1>
  53. <p>If you're changing a lightbulb right now, what's the likelihood you're a Londoner?</p>
  54. </div>
  55. <div class='slide'>
  56. <h1>Question</h1>
  57. <p>If you're changing a lightbulb right now, what's the likelihood you're a Londoner?</p>
  58. <p>(hint: the answer is <em>not</em> 99%)</p>
  59. </div>
  60. <div class='slide'>
  61. <h1>Bayes' Theorem</h1>
  62. <p>P(A|B) = P(B|A) P(A) / P(B)</p>
  63. <p>If you look it up on Wikipedia, you'll see something like this.</p>
  64. </div>
  65. <div class='slide'>
  66. <h1>Bayes' Theorem</h1>
  67. <img style="float: right;" src="bayes.png" />
  68. <p>Translated, roughly:</p>
  69. <ul>
  70. <li>P == Probability</li>
  71. <li>A == "is a Londoner"</li>
  72. <li>B == "is changing a lightbulb"</li>
  73. <li>x|y == "x, given y"</li>
  74. </ul>
  75. </div>
  76. <div class='slide'>
  77. <h1>Easy to write:</h1>
  78. <pre>
  # Apply Bayes' theorem: P(A|B) = P(B|A) * P(A) / P(B).
  # Arguments, in order: P(A), P(B), P(B|A). Returns P(A|B).
  sub bayes {
      my ($prior, $evidence, $likelihood) = @_;
      return $likelihood * $prior / $evidence;
  }
  84. </pre>
  85. </div>
  86. <div class='slide'>
  87. <h1>Right, that's the theory</h1>
  88. </div>
  89. <div class='slide'>
  90. <h1>To make a classifier...</h1>
  91. <ol>
  92. <li>Tokenize your training set</li>
  93. <li>Build your model</li>
  94. <li>Test it</li>
  95. </ol>
  96. </div>
  97. <div class='slide'>
  98. <h1>Tokenize your training set</h1>
  99. <pre>
  # Split text into its set of distinct whitespace-separated tokens.
  # Takes the text as one string; returns a flat key/value list
  # (token, 1, ...) suitable for assigning to a hash.
  sub tokenize {
      my $contents = shift;
      # split ' ' skips leading whitespace; split /\s+/ would produce an
      # empty-string token whenever the text starts with whitespace.
      my %tokens = map { $_ => 1 } split(' ', $contents);
      return %tokens;
  }
  105. </pre>
  106. </div>
  107. <div class='slide'>
  108. <h1>Build your model</h1>
  109. <pre>
  # Accumulate per-class token counts over the training set: each file
  # adds 1 for every distinct token it contains.
  my (%work_tokens, %notwork_tokens);
  for my $name (@work_files) {
      my %seen = tokenize_file("training_set/" . $name);
      %work_tokens = combine_hash(\%work_tokens, \%seen);
  }
  for my $name (@notwork_files) {
      my %seen = tokenize_file("training_set/" . $name);
      %notwork_tokens = combine_hash(\%notwork_tokens, \%seen);
  }
  # Counts for both classes combined.
  my %total_tokens = combine_hash(\%work_tokens, \%notwork_tokens);
  122. </pre>
  123. </div>
  124. <div class='slide'>
  125. <h1>Build your model</h1>
  126. <pre>
  # Merge two hashes of counts without modifying either argument.
  # Takes two hash references; returns the merged hash as a flat list.
  # Where the first hash already holds a true value for a key, the
  # second hash's value is added to it; otherwise it is copied over.
  sub combine_hash {
      my ($base, $extra) = @_;
      my %merged = %$base;
      for my $key (keys %$extra) {
          my $incoming = $extra->{$key};
          $merged{$key} = $merged{$key}
              ? $merged{$key} + $incoming
              : $incoming;
      }
      return %merged;
  }
  139. </pre>
  140. </div>
  141. <div class='slide'>
  142. <h1>Build your model</h1>
  143. <pre>
  # Read a whole file and return its set of tokens.
  # Takes a file name; returns whatever tokenize() returns for the
  # file's contents. Dies if the file cannot be opened or read.
  sub tokenize_file {
      use Fcntl qw(O_RDONLY);
      my $filename = shift;
      my $contents = '';
      # sysopen with a lexical handle: the name is never parsed for a
      # mode or pipe character, and a failure is reported, not ignored.
      sysopen(my $fh, $filename, O_RDONLY)
          or die "Cannot open $filename: $!";
      # -s gives '' for an empty file, so fall back to a length of 0.
      defined(read($fh, $contents, (-s $fh) || 0))
          or die "Cannot read $filename: $!";
      close($fh);
      return tokenize($contents);
  }
  152. </pre>
  153. </div>
  154. <div class='slide'>
  155. <h1>Build your model</h1>
  156. <pre>
  # Class priors: the fraction of training files in each class.
  my $total_work_files    = @work_files;
  my $total_notwork_files = @notwork_files;
  my $total_files         = $total_work_files + $total_notwork_files;
  my ($probability_work, $probability_notwork)
      = map { $_ / $total_files } ($total_work_files, $total_notwork_files);
  162. </pre>
  163. </div>
  164. <div class='slide'>
  165. <h1>Test it</h1>
  166. <img style="float: right;" src="bayes.png" />
  167. <p>Wait a minute ...</p>
  168. </div>
  169. <div class='slide'>
  170. <h1>Test it</h1>
  171. <img style="float: right;" src="bayes.png" />
  172. <p>Wait a minute ...</p>
  173. <p>What is P(B|A), when you have more than one B?</p>
  174. </div>
  175. <div class='slide'>
  176. <h1>Test it</h1>
  177. <img style="float: right;" src="bayes.png" />
  178. <p>Wait a minute ...</p>
  179. <p>What is P(B|A), when you have more than one B?</p>
  180. <p>For that matter, what is P(B), when you have more than one B?</p>
  181. </div>
  182. <div class='slide'>
  183. <h1>P(B|A)</h1>
  184. <p>P(B<sub>1</sub>|A) P(B<sub>2</sub>|A) ... P(B<sub><i>n</i></sub>|A)</p>
  185. </div>
  186. <div class='slide'>
  187. <h1>P(B)</h1>
  188. <p>Let's, um, ignore that for now.</p>
  189. </div>
  190. <div class='slide'>
  191. <h1>P(B)</h1>
  192. <p>Let's, um, ignore that for now.</p>
  193. <p>Trust me, it will work out.</p>
  194. </div>
  195. <div class='slide'>
  196. <h1>Test it</h1>
  197. <pre>
  # Multiply together, per class, the smoothed probability of every
  # test token that was seen in training; unseen tokens are skipped.
  my %total_tokens = combine_hash(\%work_tokens, \%notwork_tokens);
  my ($work_accumulator, $notwork_accumulator) = (1, 1);
  my $total_tokens = keys(%test_tokens);
  for my $token (keys(%test_tokens)) {
      next unless exists($total_tokens{$token});
      # Adding 1 keeps a zero count from zeroing the whole product.
      my $work_count    = ($work_tokens{$token}    || 0) + 1;
      my $notwork_count = ($notwork_tokens{$token} || 0) + 1;
      $work_accumulator
          *= $work_count / ($total_work_files + $total_tokens);
      $notwork_accumulator
          *= $notwork_count / ($total_notwork_files + $total_tokens);
  }
  212. </pre>
  213. </div>
  214. <div class='slide'>
  215. <h1>Test it</h1>
  216. <pre>
  # Score each class with bayes(), scale the two scores so they sum
  # to 1, and print each as a percentage.
  my $score_work
      = bayes($probability_work, $total_tokens, $work_accumulator);
  my $score_notwork
      = bayes($probability_notwork, $total_tokens, $notwork_accumulator);
  my $score_sum          = $score_work + $score_notwork;
  my $likelihood_work    = $score_work / $score_sum;
  my $likelihood_notwork = $score_notwork / $score_sum;
  printf("likelihood of work email: %0.2f %%\n", $likelihood_work * 100);
  printf("likelihood of notwork email: %0.2f %%\n", $likelihood_notwork * 100);
  229. </pre>
  230. </div>
  231. <div class='slide'>
  232. <h1>And, we're done!</h1>
  233. </div>
  234. <div class='slide'>
  235. <h1>Possible improvements</h1>
  236. <ul>
  237. <li>the tokenizer</li>
  238. <li>feature selection</li>
  239. <li>feature decoration</li>
  240. </ul>
  241. </div>
  242. <div class='slide'>
  243. <h1>Gotchas</h1>
  244. </div>
  245. <div class='slide'>
  246. <h1>Questions?</h1>
  247. </div>
  248. <div class='slide'>
  249. <h1>Thanks, kindly</h1>
  250. </div>
  251. </body></html>