First working, clearly not optimized, version
This commit is contained in:
parent
eaaa64bdea
commit
26d95b4cc3
49
bloom.js
Normal file
49
bloom.js
Normal file
@ -0,0 +1,49 @@
|
||||
// Shared state of the search page.

// Set to true right before index.json is requested and back to false once
// the response has been received.
var loading = false;

// Becomes true once index.json has been received; callback_change() refuses
// to search until then.
var usable = false;

// false until index.json has been loaded; afterwards it maps every key of
// index.json to the BloomFilter built from that key's word list.
var index = false;

// The form only hosts the search box, so never let the browser submit it.
$("form").submit(function(event) {
    event.preventDefault();
});
|
||||
|
||||
$("#search").click(function() {
|
||||
if($(this).val() == "Search for articles...") {
|
||||
$(this).val("");
|
||||
}
|
||||
|
||||
if(index === false) {
|
||||
loading = true;
|
||||
$("#loading").text("Loading index file...");
|
||||
$.getJSON("index.json", function(data) {
|
||||
loading = false;
|
||||
usable = true;
|
||||
$("#loading").text("");
|
||||
index = new Array();
|
||||
|
||||
for(var key in data) {
|
||||
index[key] = new BloomFilter(32*256, 16);
|
||||
|
||||
for(var word_index in data[key]) {
|
||||
index[key].add(data[key][word_index]);
|
||||
}
|
||||
}
|
||||
|
||||
callback_change();
|
||||
});
|
||||
}
|
||||
});
|
||||
|
||||
// Re-runs the search and redraws the result list.
//
// Does nothing until the index is usable. Otherwise the current content of
// the search box is tested against the Bloom filter of every entry of
// `index`, and the key of each matching entry is listed under a
// "Results :" heading in #results.
function callback_change() {
    if (!usable) {
        return;
    }

    var search = $("#search").val();
    var results = $("#results");

    results.html("<h2>Results :</h2>");

    for (var key in index) {
        // Skip anything inherited through the prototype chain.
        if (!Object.prototype.hasOwnProperty.call(index, key)) {
            continue;
        }

        if (index[key].test(search)) {
            // Keys come from index.json: insert them as text so that markup
            // characters in a key are displayed instead of parsed as HTML.
            results.append($("<p>").text(key));
        }
    }
}

// Search as the user types.
$("#search").on('input', callback_change);
|
133
bloomfilter.js
Normal file
133
bloomfilter.js
Normal file
@ -0,0 +1,133 @@
|
||||
(function(exports) {
|
||||
exports.BloomFilter = BloomFilter;
|
||||
exports.fnv_1a = fnv_1a;
|
||||
exports.fnv_1a_b = fnv_1a_b;
|
||||
|
||||
var typedArrays = typeof ArrayBuffer !== "undefined";
|
||||
|
||||
// Creates a new bloom filter.
//
// m: either the number of bits of the filter, or an array-like object (with
//    a length property) of 32-bit integers to load the filter from; in that
//    case the filter has 32 bits per element.
// k: the number of hashing functions.
//
// The instance exposes m, k, buckets (the bit set, 32 bits per element) and
// _locations (scratch storage with room for the k bit positions computed by
// locations()).
function BloomFilter(m, k) {
  var source;
  if (typeof m !== "number") {
    source = m;
    m = source.length * 32;
  }

  this.m = m;
  this.k = k;

  var bucketCount = Math.ceil(m / 32);
  var buckets;
  var idx;

  if (typedArrays) {
    // Number of bytes used per stored bit position: log2(m) bits, rounded up
    // to whole bytes and then to a power of two.
    var bitsPerLocation = Math.log(m) / Math.LN2;
    var kbytes = 1 << Math.ceil(Math.log(Math.ceil(bitsPerLocation / 8)) / Math.LN2);
    var LocationArray;
    if (kbytes === 1) {
      LocationArray = Uint8Array;
    } else if (kbytes === 2) {
      LocationArray = Uint16Array;
    } else {
      LocationArray = Uint32Array;
    }
    var kbuffer = new ArrayBuffer(kbytes * k);

    // A fresh Int32Array is already zero-filled.
    buckets = new Int32Array(bucketCount);
    this.buckets = buckets;
    if (source) {
      for (idx = 0; idx < bucketCount; idx++) {
        buckets[idx] = source[idx];
      }
    }
    this._locations = new LocationArray(kbuffer);
  } else {
    buckets = [];
    this.buckets = buckets;
    for (idx = 0; idx < bucketCount; idx++) {
      buckets[idx] = source ? source[idx] : 0;
    }
    this._locations = [];
  }
}
|
||||
|
||||
// Computes the k bit positions (each in [0, m)) associated with the string v.
//
// Only one string hash is computed: fnv_1a(v) gives the first position and
// fnv_1a_b() of that hash gives the step added for every following position.
// See http://willwhim.wordpress.com/2011/09/03/producing-n-hash-functions-by-hashing-only-once/
//
// The positions are written into, and returned as, the filter's shared
// _locations storage, so the result is overwritten by the next call.
BloomFilter.prototype.locations = function(v) {
  var count = this.k;
  var bits = this.m;
  var out = this._locations;
  var first = fnv_1a(v);
  var step = fnv_1a_b(first);
  var pos = first % bits;

  for (var i = 0; i < count; i++) {
    // The hashes are signed, so the remainder may be negative.
    out[i] = pos < 0 ? pos + bits : pos;
    pos = (pos + step) % bits;
  }
  return out;
};
|
||||
|
||||
// Adds v to the filter: v is converted to a string and the bit at each of
// its k locations is set. Returns nothing.
BloomFilter.prototype.add = function(v) {
  var positions = this.locations(v + "");
  var count = this.k;
  var words = this.buckets;

  for (var i = 0; i < count; i++) {
    var bit = positions[i];
    // 32 bits per bucket: pick the bucket, then the bit inside it.
    words[Math.floor(bit / 32)] |= 1 << (bit % 32);
  }
};
|
||||
|
||||
// Tests whether v may have been added to the filter: v is converted to a
// string and the result is true only if the bit at every one of its k
// locations is set; false as soon as one of them is clear.
BloomFilter.prototype.test = function(v) {
  var positions = this.locations(v + "");
  var count = this.k;
  var words = this.buckets;

  for (var i = 0; i < count; i++) {
    var bit = positions[i];
    var mask = 1 << (bit % 32);
    if ((words[Math.floor(bit / 32)] & mask) === 0) {
      return false;
    }
  }
  return true;
};
|
||||
|
||||
// Estimated cardinality: counts the bits set across all buckets and returns
// -m * ln(1 - setBits / m) / k.
BloomFilter.prototype.size = function() {
  var words = this.buckets;
  var setBits = 0;

  for (var i = 0; i < words.length; i++) {
    setBits += popcnt(words[i]);
  }
  return -this.m * Math.log(1 - setBits / this.m) / this.k;
};
|
||||
|
||||
// Returns the number of bits set in the 32-bit integer v, using the
// parallel bit count from
// http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
function popcnt(v) {
  // Bit counts of every 2-bit group.
  var pairs = v - ((v >> 1) & 0x55555555);
  // Bit counts of every 4-bit group.
  var nibbles = (pairs & 0x33333333) + ((pairs >> 2) & 0x33333333);
  // Bit counts of every byte.
  var bytes = (nibbles + (nibbles >> 4)) & 0xF0F0F0F;
  // The multiplication sums the four byte counts into the top byte.
  return (bytes * 0x1010101) >> 24;
}
|
||||
|
||||
// Fowler/Noll/Vo hashing.
//
// Hashes the string v and returns a signed 32-bit integer. Every UTF-16 code
// unit of v is fed to the hash one byte at a time: the high byte first, and
// only when it is non-zero, then the low byte. A final mixing step is applied
// to the result.
//
// charCodeAt() never returns more than 0xFFFF, so a code unit has at most
// two significant bytes and no wider masks are needed.
function fnv_1a(v) {
  var a = 2166136261;

  for (var i = 0, n = v.length; i < n; ++i) {
    var c = v.charCodeAt(i);
    var d = c & 0xff00;

    if (d) {
      a ^= d >> 8;
      // Shift-and-add form of the multiplication by the FNV prime.
      a += (a << 1) + (a << 4) + (a << 7) + (a << 8) + (a << 24);
    }
    a ^= c & 0xff;
    a += (a << 1) + (a << 4) + (a << 7) + (a << 8) + (a << 24);
  }

  // From http://home.comcast.net/~bretm/hash/6.html
  a += a << 13;
  a ^= a >> 7;
  a += a << 3;
  a ^= a >> 17;
  a += a << 5;
  return a & 0xffffffff;
}
|
||||
|
||||
// One additional iteration of FNV, given a hash: applies the shift-and-add
// multiplication step once more to a, followed by the same final mixing as
// fnv_1a, and returns a signed 32-bit integer.
function fnv_1a_b(a) {
  var h = a + (a << 1) + (a << 4) + (a << 7) + (a << 8) + (a << 24);

  h = h + (h << 13);
  h = h ^ (h >> 7);
  h = h + (h << 3);
  h = h ^ (h >> 17);
  h = h + (h << 5);
  return h & 0xffffffff;
}
|
||||
})(typeof exports !== "undefined" ? exports : this);
|
@ -3,6 +3,7 @@
|
||||
import os
|
||||
from lxml import html
|
||||
import re
|
||||
import json
|
||||
from collections import defaultdict
|
||||
|
||||
|
||||
@ -23,7 +24,6 @@ def remove_common_words(words):
|
||||
samples = list_directory("samples/")
|
||||
index = defaultdict(list)
|
||||
|
||||
i = 0
|
||||
for sample in samples:
|
||||
with open(sample, 'r') as sample_fh:
|
||||
content = sample_fh.read()
|
||||
@ -40,9 +40,7 @@ for sample in samples:
|
||||
# TODO : Could use http://tartarus.org/martin/PorterStemmer/
|
||||
|
||||
for word in words:
|
||||
index[word].append(i)
|
||||
index[sample].append(word)
|
||||
|
||||
i += 1
|
||||
|
||||
print(samples)
|
||||
print(index.items())
|
||||
with open("index.json", 'w') as index_fh:
|
||||
index_fh.write(json.dumps(index))
|
||||
|
20
index.html
Normal file
20
index.html
Normal file
@ -0,0 +1,20 @@
|
||||
<!doctype html>
|
||||
<html lang="fr">
|
||||
<head>
|
||||
<meta charset="utf-8">
|
||||
<title>BloomJS demo</title>
|
||||
</head>
|
||||
<body>
|
||||
<h1>Bloom.JS demo</h1>
|
||||
<p id="loading"></p>
|
||||
<form>
|
||||
<p>
|
||||
<input type="text" id="search" name="search" value="Search for articles..."/>
|
||||
</p>
|
||||
</form>
|
||||
<div id="results"></div>
|
||||
<script type="text/javascript" src="jquery-2.0.3.min.js"></script>
|
||||
<script type="text/javascript" src="bloomfilter.js"></script>
|
||||
<script type="text/javascript" src="bloom.js"></script>
|
||||
</body>
|
||||
</html>
|
1
index.json
Normal file
1
index.json
Normal file
File diff suppressed because one or more lines are too long
6
jquery-2.0.3.min.js
vendored
Normal file
6
jquery-2.0.3.min.js
vendored
Normal file
File diff suppressed because one or more lines are too long
Loading…
Reference in New Issue
Block a user