jmelesky
/
pdorg_site


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310
							<!DOCTYPE HTML PUBLIC "-//IETF//DTD HTML//EN">
<html> <head>
<title>Write Your Own Bayesian Classifier!</title>

<style>

.slide {
  border: 2px solid #006600;
  background-color: #CCFFCC;
  position: absolute;
  padding: 5%;
  width: 85%;
  height: 80%;
}


</style>

<script src="scripts/jquery-1.2.3.js" type="text/javascript"></script>
<script src="scripts/slideshow.js" type="text/javascript"></script>
</head>

<body>


<div class='slide'>
<h1>Write Your Own Bayesian Classifier!</h1>
<p>(LPW '07) (john melesky)</p>
<p>This is an hour talk, squeezed into 30 minutes, so let's get going.</p>
</div>

<div class='slide fade'>
<h1>Write Your Own Bayesian Classifier!</h1>
<p>(LPW '07) (john melesky)</p>
<p>This is an hour talk, squeezed into 30 minutes, so let's get going.</p>
<p>(oh, i'm available for freelance work in machine learning/natural language processing)</p>
</div>

<div class='slide'>
<h1>Wait, wait</h1>
<p><em>Why</em> the hell should i <del>reinvent the wheel</del> write my own Bayesian classifier?</p>
</div>

<div class='slide'>
<h1>Opening joke</h1>
<p>Q: How many Londoners does it take to change a lightbulb?</p>
</div>

<div class='slide'>
<h1>Opening joke</h1>
<p>Q: How many Londoners does it take to change a lightbulb?</p>
<p>A: 127</p>
</div>

<div class='slide'>
<h1>Really?</h1>
<p>If this is true, that means that 8.415 million Londoners (99%!) are changing lightbulbs <em>right now</em>.</p>
</div>

<div class='slide'>
<h1>Really?</h1>
<p>If this is true, that means that 8.415 million Londoners (99%!) are changing lightbulbs <em>right now</em>.</p>
<p>By contrast, only 1% of the rest of the world are currently changing lightbulbs.</p>
</div>

<div class='slide'>
<h1>Question</h1>
<p>If you're changing a lightbulb right now, what's the likelihood you're a Londoner?</p>
</div>

<div class='slide'>
<h1>Question</h1>
<p>If you're changing a lightbulb right now, what's the likelihood you're a Londoner?</p>
<p>(hint: the answer is <em>not</em> 99%)</p>
</div>

<div class='slide'>
<h1>Bayes' Theorem</h1>
<p><img src="bayes.png" alt="P(A|B) = P(B|A) P(A) / P(B)" /></p>
<p>If you look it up on Wikipedia, you'll see something like this.</p>
</div>

<div class='slide'>
<h1>Bayes' Theorem</h1>
<img style="float: right;" src="bayes.png" />
<p>Translated, roughly:</p>
<ul>
  <li>P == Probability</li>
  <li>A == "is a Londoner"</li>
  <li>B == "is changing a lightbulb"</li>
  <li>x|y == "x, given y"</li>
</ul>
</div>

<div class='slide'>
<h1>Easy to write:</h1>
<pre>
# Bayes' theorem: P(A|B) = P(B|A) * P(A) / P(B)
# Arguments, in order: P(A), P(B), P(B|A).  Returns P(A|B).
sub bayes {
  my ($p_a, $p_b, $p_b_a) = @_;

  return $p_b_a * $p_a / $p_b;
}
</pre>
</div>

<div class='slide'>
<h1>Right, that's the theory</h1>
</div>

<div class='slide'>
<h1>To make a classifier...</h1>
<ol>
  <li>Tokenize your training set</li>
  <li>Build your model</li>
  <li>Test it</li>
</ol>
</div>

<div class='slide'>
<h1>Tokenize your training set</h1>
<pre>
# Break a string into its set of distinct whitespace-separated
# tokens.  Returns a hash mapping each token to 1.
sub tokenize {
  my $contents = shift;
  
  # split ' ' skips leading whitespace; split /\s+/ would
  # hand back a bogus '' token for text starting with a blank
  my %tokens = map { $_ => 1 } split(' ', $contents);
  return %tokens;
}
</pre>
</div>

<div class='slide'>
<h1>Build your model</h1>
<pre>
# for each category: token => number of training files
# in that category containing the token
my (%work_tokens, %notwork_tokens);

for my $work_file (@work_files) {
  my %seen = tokenize_file("training_set/$work_file");
  %work_tokens = combine_hash(\%work_tokens, \%seen);
}

for my $notwork_file (@notwork_files) {
  my %seen = tokenize_file("training_set/$notwork_file");
  %notwork_tokens = combine_hash(\%notwork_tokens, \%seen);
}

# the same counts across both categories
my %total_tokens = combine_hash(\%work_tokens,
                                \%notwork_tokens);
</pre>
</div>

<div class='slide'>
<h1>Build your model</h1>
<pre>
# Merge two hashes of counts (passed by reference) into a new
# hash, adding the values of keys present in both.  Neither
# argument is modified.
sub combine_hash {
  my ($hash1, $hash2) = @_;

  my %merged = %$hash1;

  for my $token (keys %$hash2) {
    my $extra = $hash2->{$token};
    # a missing (or false) running count is simply replaced
    $merged{$token} = $merged{$token}
                    ? $merged{$token} + $extra
                    : $extra;
  }

  return %merged;
}
</pre>
</div>

<div class='slide'>
<h1>Build your model</h1>
<pre>
# Read a whole file and return its token set (see tokenize).
# Dies if the file cannot be opened.
sub tokenize_file {
  my $filename = shift;
  
  # three-arg open with a lexical handle; fail loudly rather
  # than quietly tokenizing an empty string
  open(my $fh, '&lt;', $filename)
    or die "can't open $filename: $!";
  
  my $contents = '';
  read($fh, $contents, -s $fh);
  close($fh);
  
  return tokenize($contents);
}
</pre>
</div>

<div class='slide'>
<h1>Build your model</h1>
<pre>
# priors: the share of training files in each category
my $total_work_files    = @work_files;
my $total_notwork_files = @notwork_files;
my $total_files = $total_work_files + $total_notwork_files;
my ($probability_work, $probability_notwork) =
  map { $_ / $total_files }
      ($total_work_files, $total_notwork_files);
</pre>
</div>

<div class='slide'>
<h1>Test it</h1>
<img style="float: right;" src="bayes.png" />
<p>Wait a minute ...</p>
</div>

<div class='slide'>
<h1>Test it</h1>
<img style="float: right;" src="bayes.png" />
<p>Wait a minute ...</p>
<p>What is P(B|A), when you have more than one B?</p>
</div>

<div class='slide'>
<h1>Test it</h1>
<img style="float: right;" src="bayes.png" />
<p>Wait a minute ...</p>
<p>What is P(B|A), when you have more than one B?</p>
<p>For that matter, what is P(B), when you have more than one B?</p>
</div>

<div class='slide'>
<h1>P(B|A)</h1>
<p>P(B<sub>1</sub>|A) P(B<sub>2</sub>|A) ... P(B<sub><i>n</i></sub>|A)</p>
</div>

<div class='slide'>
<h1>P(B)</h1>
<p>Let's, um, ignore that for now.</p>
</div>

<div class='slide'>
<h1>P(B)</h1>
<p>Let's, um, ignore that for now.</p>
<p>Trust me, it will work out.</p>
</div>

<div class='slide'>
<h1>Test it</h1>
<pre>
# every token that appeared anywhere in the training set
my %total_tokens = combine_hash(\%work_tokens, \%notwork_tokens);

my $total_tokens = scalar(keys(%test_tokens));
my ($work_accumulator, $notwork_accumulator) = (1, 1);

for my $token (keys %test_tokens) {
  # skip tokens that never appeared in the training set
  next unless exists $total_tokens{$token};

  # the +1 keeps a token missing from one category from
  # zeroing that category's running product
  my $work_count    = $work_tokens{$token}    || 0;
  my $notwork_count = $notwork_tokens{$token} || 0;

  $work_accumulator *= ($work_count + 1)
                     / ($total_work_files + $total_tokens);
  $notwork_accumulator *= ($notwork_count + 1)
                        / ($total_notwork_files + $total_tokens);
}
</pre>
</div>

<div class='slide'>
<h1>Test it</h1>
<pre>
# $total_tokens stands in for P(B); it is the same for both
# categories, so it cancels out in the normalization below
my $score_work    = bayes($probability_work,
                          $total_tokens,
                          $work_accumulator);
my $score_notwork = bayes($probability_notwork,
                          $total_tokens,
                          $notwork_accumulator);

# normalize so the two likelihoods sum to 1
my $score_sum = $score_work + $score_notwork;
my $likelihood_work    = $score_work / $score_sum;
my $likelihood_notwork = $score_notwork / $score_sum;

printf("likelihood of work email: %0.2f %%\n",
       $likelihood_work * 100);
printf("likelihood of notwork email: %0.2f %%\n",
       $likelihood_notwork * 100);
</pre>
</div>

<div class='slide'>
<h1>And, we're done!</h1>
</div>

<div class='slide'>
<h1>Possible improvements</h1>
<ul>
  <li>the tokenizer</li>
  <li>feature selection</li>
  <li>feature decoration</li>
</ul>
</div>

<div class='slide'>
<h1>Gotchas</h1>
</div>

<div class='slide'>
<h1>Questions?</h1>
</div>

<div class='slide'>
<h1>Thanks, kindly</h1>
</div>

</body></html>