Machine Learning in Javascript

Not Unlike Scubadiving in Tuxedos

DrDub, Lightning Talk, OHM2013

Presenter Notes

URLClassy

Problem: classify a URL into 15 categories, such as "Sports", "Arts".
- For background information on the problem see http://wiki.duboue.net/index.php/URL_Classifier
Application: Felix Menard's Bookmark Organizer project
Data: Open Directory project (http://dmoz.org)
Approach: character n-grams and Naive Bayes classifier
Technology: node.js and Apparatus, part of NaturalNode

Presenter Notes

Data

Main Category	Sub Category	URL
Arts	Animation	http://shotani.www2.50megs.com/animen_uno.html
Arts	Animation	http://valleyofazure.tripod.com/
Arts	Animation	http://www.angelfire.com/anime2/bestanimecharacters/
Arts	Animation	http://www.angelfire.com/anime2/ninisbishonen/
Arts	Animation	http://www.angelfire.com/grrl/magicshoppe2/
Arts	Animation	http://www.angelfire.com/nv/neko/
Arts	Animation	http://anime-alberta.org/
Arts	Animation	http://animeclub.org/

4,137,187 rows, 548 subcategories, 15 top level categories

Presenter Notes

How To Use It

/**
 * Convert a URL into a sparse observation of character n-gram features.
 *
 * Every substring of length NGRAM_SIZE is looked up in the shared `features`
 * dictionary (n-gram -> numeric index). N-grams seen for the first time are
 * registered under the next free index, `features.LENGTH`, which is then
 * incremented -- so calling this function mutates `features`.
 *
 * @param {string} url - The URL to featurize.
 * @returns {Object} Object whose keys are the feature indices of the n-grams
 *     found in `url`, each mapped to 1. Empty when `url` is shorter than
 *     NGRAM_SIZE.
 */
function instanceToSparseObservation(url){
    var these_features = {};
    // Inclusive bound: the n-gram that ends on the last character starts at
    // url.length - NGRAM_SIZE and must not be skipped.
    var last_start = url.length - NGRAM_SIZE;
    for(var i=0; i <= last_start; i++){
        var feat = url.slice(i, i + NGRAM_SIZE);
        // Own-property check: `in` would also match names inherited from
        // Object.prototype (e.g. "toString") and yield a non-index value.
        if(!Object.prototype.hasOwnProperty.call(features, feat)){
            features[feat] = features.LENGTH;
            features.LENGTH++;
        }
        these_features[features[feat]] = 1;
    }

    return these_features;
}

/**
 * Classify a single URL.
 *
 * Turns the URL into a sparse n-gram observation and hands it to the
 * classifier's getSparseClassifications, returning whatever that call returns.
 *
 * @param {string} url - The URL to classify.
 */
function be_classy(url) {
    var observation = instanceToSparseObservation(url);
    return classifier.getSparseClassifications(observation);
}

Presenter Notes

Training

// Train the classifier: stream the TSV file line by line and add each row
// to the classifier as a sparse example labelled with its first column.
new lazy(fs.createReadStream('two_cats_urls.tsv'))
    .lines
    .forEach(function(line){
        var parts = line.toString().split("\t");
        // Rows need at least three tab-separated columns (label in column 0,
        // URL in column 2); skip blank or malformed lines.
        if(parts.length < 3){
            return;
        }
        // Label is the first column only; the commented-out suffix would
        // append the second column to the label.
        var classy = parts[0]; //+"-"+parts[1];
        var instance = { 'classy': classy,
                         'url' : parts[2] };
        // Featurize the URL string itself, not the wrapper object --
        // instanceToSparseObservation reads .length and substrings of its
        // argument.
        classifier.addSparseExample(instanceToSparseObservation(instance.url),
                                    instance.classy);
    });

Presenter Notes

Trained Model

classifier.js (18 MiB)

{"classFeatures":{"Arts":{"2":1.0001,"85":1.0001,"162":1.0001,"201":1.0001,"282":1.0001,"283":1.0001,
"284":1.0001,"289":1.0001,"294":1.0001,"303":1.0001,"305":3.0000999999999998,"337":1.0001,"374":1.0001,
"410":1.0001,"411":1.0001,"423":1.0001,"440":1.0001,"501":1.0001,"519":1.0001,"526":1.0001,"531":1.0001,
"558":1.0001,"570":1.0001,"598":1.0001,"601":1.0001,"661":1.0001,"705":1.0001,"773":1.0001,"794":1.0001,
"800":1.0001,"819":1.0001,"838":2.0000999999999998,"910":2.0000999999999998,"912":1.0001,"924":1.0001,
"929":1.0001,"992":1.0001,"1018":1.0001,"1020":1.0001,"1027":3.0000999999999998,"1054":1.0001,
"1064":1.0001,"1092":7.0001,"1167":1.0001,"1187":1.0001,"1193":1.0001,"1195":1.0001,"1202":2.0000999999999998,
"1237":1.0001,"1254":1.0001,"1257":1.0001,"1271":2.0000999999999998,"1273":1.0001,"1294":1.0001,
"1299":1.0001,"1306":1.0001,"1351":1.0001,"1364":2.0000999999999998,"1368":1.0001,"1375":1.0001,
"1379":1.0001,"1388":1.0001,"1397":1.0001,"1398":1.0001,"1411":1.0001,"1428":1.0001,
"1447":2.0000999999999998,"1453":1.0001,"1455":1.0001,"1457":1.0001,"1469":1.0001,"1489":1.0001,
"1492":1.0001,"1514":1.0001,"1518":1.0001,"1533":1.0001, ...

features.js (~6 MiB)

var trained_features={"esta":5807,"stap":5808,"tapy":5809,"apyr":5810,"pyra":5811,"yral":5812,"ral/":5813,
"al/s":5814,"l/sk":5815,"/sk.":5816,"sk.h":5817,"m/~r":5818,"/~ru":5819,"~ruk":5820,"ukaw":5821,"wa_k":5822,
"a_ka":5823,"_kae":5824,"kaed":5825,"aede":5826,"ede/":5827,"de/i":5828,"m/~K":5829,"/~Ka":5830,"~Kat":5831,
"Kati":5832,"ati5":5833,"ti5_":5834,"i5_D":5835,"5_D/":5836,"_D/Z":5837,"D/Ze":5838,"/Zel":5839,"Zel.":5840,
"m/~T":5841,"/~Th":5842,"~The":5843,"The_":5844,"he_S":5845,"e_Sl":5846,"_Sla":5847,"Slay":5848,"laye":5849,
"ayer":5850,"yers":5851,"m/so":5852,"/sol":5853,"solb":5854,"olbi":5855,"lbia":5856,"bian":5857,"ianc":5858,
"anca":5859,"nca/":5860,"ca/a":5861,"a/an":5862,"r-so":5863,"-sol":5864,"nca2":5865,"ca2.":5866,"a2.h":5867,

Presenter Notes

Sparse Observations

Contributed back to Apparatus as part of the 24pullrequests dare.

9:23 AM - 2 Dec 12
‏@chrisumbel
@pabloduboue sweet, you're the santa clause of additive smoothing!

Presenter Notes

DEMO

The current demo (in example/) is trained on the top-level classes and 10%. Performance on unseen URLs is 54% accuracy, but that includes lots of repeated URLs. Actual performance seems to be 30% at most.

http://drdub.github.com/urlclassy/example/

Presenter Notes

Have an idea?

Something interesting to apply ML in JS during the camp?

Let's talk!

Table of Contents	t
Exposé	ESC
Full screen slides	e
Presenter View	p
Source Files	s
Slide Numbers	n
Toggle screen blanking	b
Show/hide slide context	c
Notes	2
Help	h

Not Unlike Scubadiving in Tuxedos

Table of Contents

Help