DrDub, Lightning Talk, OHM2013
Problem: classify a URL into 15 categories, such as "Sports", "Arts".
- For background information on the problem see http://wiki.duboue.net/index.php/URL_Classifier
Application: Felix Menard's Bookmark Organizer project
Data: Open Directory project (http://dmoz.org)
Approach: character n-grams and Naive Bayes classifier
Technology: node.js and Apparatus, part of NaturalNode
Main Category | Sub Category | URL |
Arts | Animation | http://shotani.www2.50megs.com/animen_uno.html |
Arts | Animation | http://valleyofazure.tripod.com/ |
Arts | Animation | http://www.angelfire.com/anime2/bestanimecharacters/ |
Arts | Animation | http://www.angelfire.com/anime2/ninisbishonen/ |
Arts | Animation | http://www.angelfire.com/grrl/magicshoppe2/ |
Arts | Animation | http://www.angelfire.com/nv/neko/ |
Arts | Animation | http://anime-alberta.org/ |
Arts | Animation | http://animeclub.org/ |
4,137,187 rows, 548 subcategories, 15 top level categories
/**
 * Convert a URL string into a sparse observation of character n-grams.
 *
 * Every substring of NGRAM_SIZE consecutive characters is looked up in the
 * shared `features` table (n-gram -> numeric feature index). N-grams not seen
 * before are registered with the next free index, tracked in
 * `features.LENGTH`, so this function mutates `features` as a side effect.
 *
 * @param {string} url - The URL to featurize.
 * @returns {Object} Map from feature index to 1 for every n-gram present in
 *     the URL; empty when the URL is shorter than NGRAM_SIZE.
 */
function instanceToSparseObservation(url){
    var these_features = {};
    // `<=` (not `<`) so the final n-gram, ending at the last character of the
    // URL, is included as well.
    for(var i = 0; i <= url.length - NGRAM_SIZE; i++){
        var feat = url.slice(i, i + NGRAM_SIZE);
        // Own-property check: `in` would also match keys inherited from
        // Object.prototype and hand back a function instead of an index.
        if(!Object.prototype.hasOwnProperty.call(features, feat)){
            features[feat] = features.LENGTH;
            features.LENGTH++;
        }
        these_features[features[feat]] = 1;
    }
    return these_features;
}

/**
 * Classify a URL with the shared `classifier`.
 *
 * @param {string} url - The URL to classify.
 * @returns {*} Whatever `classifier.getSparseClassifications` returns for the
 *     sparse n-gram observation built from the URL.
 */
function be_classy(url) {
    return classifier.getSparseClassifications(instanceToSparseObservation(url));
}
// Stream the tab-separated training file line by line and feed every URL to
// the classifier as a sparse n-gram observation labelled with its category.
new lazy(fs.createReadStream('two_cats_urls.tsv'))
    .lines
    .forEach(function(line){
        var parts = line.toString().split("\t");
        // Skip blank or malformed lines that have no URL column.
        if(parts.length < 3){
            return;
        }
        // Train on the first column only; append "-" + parts[1] to the label
        // to train on the second column as well.
        var classy = parts[0];
        var instance = { 'classy': classy, 'url': parts[2] };
        // Pass the URL string, not the whole instance object: the featurizer
        // reads `.length` and slices characters from its argument.
        classifier.addSparseExample(instanceToSparseObservation(instance.url), instance.classy);
    });
{"classFeatures":{"Arts":{"2":1.0001,"85":1.0001,"162":1.0001,"201":1.0001,"282":1.0001,"283":1.0001, "284":1.0001,"289":1.0001,"294":1.0001,"303":1.0001,"305":3.0000999999999998,"337":1.0001,"374":1.0001, "410":1.0001,"411":1.0001,"423":1.0001,"440":1.0001,"501":1.0001,"519":1.0001,"526":1.0001,"531":1.0001, "558":1.0001,"570":1.0001,"598":1.0001,"601":1.0001,"661":1.0001,"705":1.0001,"773":1.0001,"794":1.0001, "800":1.0001,"819":1.0001,"838":2.0000999999999998,"910":2.0000999999999998,"912":1.0001,"924":1.0001, "929":1.0001,"992":1.0001,"1018":1.0001,"1020":1.0001,"1027":3.0000999999999998,"1054":1.0001, "1064":1.0001,"1092":7.0001,"1167":1.0001,"1187":1.0001,"1193":1.0001,"1195":1.0001,"1202":2.0000999999999998, "1237":1.0001,"1254":1.0001,"1257":1.0001,"1271":2.0000999999999998,"1273":1.0001,"1294":1.0001, "1299":1.0001,"1306":1.0001,"1351":1.0001,"1364":2.0000999999999998,"1368":1.0001,"1375":1.0001, "1379":1.0001,"1388":1.0001,"1397":1.0001,"1398":1.0001,"1411":1.0001,"1428":1.0001," 1447":2.0000999999999998,"1453":1.0001,"1455":1.0001,"1457":1.0001,"1469":1.0001,"1489":1.0001, "1492":1.0001,"1514":1.0001,"1518":1.0001,"1533":1.0001, ...
var trained_features={"esta":5807,"stap":5808,"tapy":5809,"apyr":5810,"pyra":5811,"yral":5812,"ral/":5813, "al/s":5814,"l/sk":5815,"/sk.":5816,"sk.h":5817,"m/~r":5818,"/~ru":5819,"~ruk":5820,"ukaw":5821,"wa_k":5822, "a_ka":5823,"_kae":5824,"kaed":5825,"aede":5826,"ede/":5827,"de/i":5828,"m/~K":5829,"/~Ka":5830,"~Kat":5831, "Kati":5832,"ati5":5833,"ti5_":5834,"i5_D":5835,"5_D/":5836,"_D/Z":5837,"D/Ze":5838,"/Zel":5839,"Zel.":5840, "m/~T":5841,"/~Th":5842,"~The":5843,"The_":5844,"he_S":5845,"e_Sl":5846,"_Sla":5847,"Slay":5848,"laye":5849, "ayer":5850,"yers":5851,"m/so":5852,"/sol":5853,"solb":5854,"olbi":5855,"lbia":5856,"bian":5857,"ianc":5858, "anca":5859,"nca/":5860,"ca/a":5861,"a/an":5862,"r-so":5863,"-sol":5864,"nca2":5865,"ca2.":5866,"a2.h":5867,
Contributed back to Apparatus as part of the 24pullrequests dare.
9:23 AM - 2 Dec 12
@chrisumbel
@pabloduboue sweet, you're the santa clause of additive smoothing!
The current demo (in example/) is trained on the top-level classes and 10% of the data. Unseen performance is 54% accuracy, but that includes lots of repeated URLs. Actual performance seems to be 30% at most.
Something interesting to apply ML in JS during the camp?
Let's talk!
Machine Learning in Javascript | 1 |
---|---|
URLClassy | 2 |
Data | 3 |
How To Use It | 4 |
Training | 5 |
Trained Model | 6 |
Sparse Observations | 7 |
DEMO | 8 |
Have an idea? | 9 |
Table of Contents | t |
---|---|
Exposé | ESC |
Full screen slides | e |
Presenter View | p |
Source Files | s |
Slide Numbers | n |
Toggle screen blanking | b |
Show/hide slide context | c |
Notes | 2 |
Help | h |