Reading of number of samples working now in both JS and Python
This commit is contained in:
parent
221eeaf8f5
commit
d7f9873a35
2
.gitignore
vendored
2
.gitignore
vendored
@ -1,3 +1,3 @@
|
|||||||
*/__pycache__
|
*/__pycache__
|
||||||
*/search_index
|
*/search_index*
|
||||||
*/pages_index.json
|
*/pages_index.json
|
||||||
|
@ -13,7 +13,6 @@
|
|||||||
</p>
|
</p>
|
||||||
</form>
|
</form>
|
||||||
<div id="results"></div>
|
<div id="results"></div>
|
||||||
<script type="text/javascript" src="js/bloomfilter.js"></script>
|
|
||||||
<script type="text/javascript" src="js/bloom.js"></script>
|
<script type="text/javascript" src="js/bloom.js"></script>
|
||||||
</body>
|
</body>
|
||||||
</html>
|
</html>
|
||||||
|
@ -23,15 +23,21 @@ def remove_common_words(words):
|
|||||||
returned = [word for word in words if len(word) > 3]
|
returned = [word for word in words if len(word) > 3]
|
||||||
return returned
|
return returned
|
||||||
|
|
||||||
|
def padding_16(x):
|
||||||
def bitfield(n, fill):
|
if x < 256:
|
||||||
return [1 if digit=='1' else 0 for digit in bin(n)[2:].zfill(fill)]
|
return bytes([0,len(samples)])
|
||||||
|
else:
|
||||||
|
return bytes([int(x/256), x%256])
|
||||||
|
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
samples = list_directory("../samples/")
|
samples = list_directory("../samples/")
|
||||||
filters = {}
|
filters = {}
|
||||||
p = stemmer.PorterStemmer()
|
p = stemmer.PorterStemmer()
|
||||||
write = bitarray(bitfield(len(samples), 16))
|
write_little = bitarray(endian="little")
|
||||||
|
write_big = bitarray(endian="big")
|
||||||
|
|
||||||
|
write_little.frombytes(padding_16(len(samples)))
|
||||||
|
write_big.frombytes(padding_16(len(samples)))
|
||||||
|
|
||||||
if len(samples) > 65535:
|
if len(samples) > 65535:
|
||||||
sys.exit("[ERROR] Too many articles to index. You will have to change the "
|
sys.exit("[ERROR] Too many articles to index. You will have to change the "
|
||||||
@ -62,11 +68,17 @@ for sample in samples:
|
|||||||
"will have to change the way data is stored in the binary "
|
"will have to change the way data is stored in the binary "
|
||||||
"file to handle such amount of text.")
|
"file to handle such amount of text.")
|
||||||
|
|
||||||
write.extend(bitfield(filters[sample].bitarray.length(), 16))
|
#write_little.extend(bitfield(filters[sample].bitarray.length(), 16))
|
||||||
write.extend(filters[sample].bitarray)
|
#write_little.extend(filters[sample].bitarray)
|
||||||
|
#write_big.extend(bitfield(filters[sample].bitarray.length(), 16))
|
||||||
|
#write_big.extend(filters[sample].bitarray)
|
||||||
|
|
||||||
with open('../data/search_index', 'wb') as index_fh:
|
with open('../data/search_index_little', 'wb') as index_fh:
|
||||||
index_fh.write(write.tobytes())
|
print(write_little)
|
||||||
|
write_little.tofile(index_fh)
|
||||||
|
with open('../data/search_index_big', 'wb') as index_fh:
|
||||||
|
print(write_big)
|
||||||
|
write_big.tofile(index_fh)
|
||||||
|
|
||||||
with open('../data/pages_index.json', 'w') as pages_fh:
|
with open('../data/pages_index.json', 'w') as pages_fh:
|
||||||
pages_fh.write(json.dumps(samples))
|
pages_fh.write(json.dumps(samples))
|
||||||
|
35
js/bloom.js
35
js/bloom.js
@ -2,8 +2,19 @@ var loading = false;
|
|||||||
var usable = false;
|
var usable = false;
|
||||||
var search_index = false;
|
var search_index = false;
|
||||||
|
|
||||||
window.onload = function() {
|
// Check endianness to serve right file
|
||||||
};
|
function checkEndian(){
|
||||||
|
var a = new ArrayBuffer(4);
|
||||||
|
var b = new Uint8Array(a);
|
||||||
|
var c = new Uint32Array(a);
|
||||||
|
b[0] = 0xa1;
|
||||||
|
b[1] = 0xb2;
|
||||||
|
b[2] = 0xc3;
|
||||||
|
b[3] = 0xd4;
|
||||||
|
if(c[0] == 0xd4c3b2a1) return "little";
|
||||||
|
if(c[0] == 0xa1b2c3d4) return "big";
|
||||||
|
else return 0;
|
||||||
|
}
|
||||||
|
|
||||||
document.getElementById('search_form').addEventListener('submit', function(e) {
|
document.getElementById('search_form').addEventListener('submit', function(e) {
|
||||||
e.preventDefault();
|
e.preventDefault();
|
||||||
@ -19,18 +30,17 @@ document.getElementById('search').addEventListener('click', function() {
|
|||||||
document.getElementById("loading").innerHTML = "Loading index file...";
|
document.getElementById("loading").innerHTML = "Loading index file...";
|
||||||
|
|
||||||
var oReq = new XMLHttpRequest();
|
var oReq = new XMLHttpRequest();
|
||||||
oReq.open("GET", "/data/search_index", true);
|
oReq.open("GET", "data/search_index_"+checkEndian(), true);
|
||||||
oReq.responseType = "arraybuffer";
|
oReq.responseType = "arraybuffer";
|
||||||
|
|
||||||
oReq.onload = function (oEvent) {
|
oReq.onload = function (oEvent) {
|
||||||
var arrayBuffer = oReq.response; // Note: not oReq.responseText
|
var arrayBuffer = oReq.response; // Note: not oReq.responseText
|
||||||
if (arrayBuffer) {
|
|
||||||
loading = false;
|
|
||||||
usable = true;
|
|
||||||
document.getElementById("loading").innerHTML = "";
|
|
||||||
|
|
||||||
|
if (arrayBuffer) {
|
||||||
var tmp = new Uint8Array(arrayBuffer);
|
var tmp = new Uint8Array(arrayBuffer);
|
||||||
var nb_filters = 0;
|
var nb_filters = 0;
|
||||||
|
console.log(tmp);
|
||||||
|
return;
|
||||||
|
|
||||||
// First 16 bits == number of bitarrays
|
// First 16 bits == number of bitarrays
|
||||||
for (var i = 0; i < 16; i++) {
|
for (var i = 0; i < 16; i++) {
|
||||||
@ -55,6 +65,9 @@ document.getElementById('search').addEventListener('click', function() {
|
|||||||
|
|
||||||
offset += 16 + length;
|
offset += 16 + length;
|
||||||
}
|
}
|
||||||
|
document.getElementById("loading").innerHTML = "";
|
||||||
|
loading = false;
|
||||||
|
usable = true;
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
document.getElementById("loading").innerHTML = "Error while loading search index.";
|
document.getElementById("loading").innerHTML = "Error while loading search index.";
|
||||||
@ -78,20 +91,20 @@ document.getElementById('search').addEventListener('click', function() {
|
|||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
function callback_change() {
|
/*function callback_change() {
|
||||||
if(!usable) {
|
if(!usable) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
var search = document.getElementById("search").value;
|
var search = document.getElementById("search").value;
|
||||||
document.getElementById("results").innerHTML = "<h2>Results :</h2>";
|
document.getElementById("results").innerHTML = "<h2>Results :</h2>";
|
||||||
/*TODO for(var key in index) {
|
//* for(var key in index) {
|
||||||
if(index[key].test(search)) {
|
if(index[key].test(search)) {
|
||||||
document.getElementById("results").innerHTML += "<p>"+key+"</p>";
|
document.getElementById("results").innerHTML += "<p>"+key+"</p>";
|
||||||
}
|
}
|
||||||
}*/
|
}* //
|
||||||
if(!document.querySelectorAll("#results p").length) {
|
if(!document.querySelectorAll("#results p").length) {
|
||||||
document.getElementById("results").innerHTML += "<p>No results...</p>";
|
document.getElementById("results").innerHTML += "<p>No results...</p>";
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
document.getElementById("search").addEventListener('input', callback_change);
|
document.getElementById("search").addEventListener('input', callback_change);*/
|
||||||
|
@ -1,133 +0,0 @@
|
|||||||
(function(exports) {
|
|
||||||
exports.BloomFilter = BloomFilter;
|
|
||||||
exports.fnv_1a = fnv_1a;
|
|
||||||
exports.fnv_1a_b = fnv_1a_b;
|
|
||||||
|
|
||||||
var typedArrays = typeof ArrayBuffer !== "undefined";
|
|
||||||
|
|
||||||
// Creates a new bloom filter. If *m* is an array-like object, with a length
|
|
||||||
// property, then the bloom filter is loaded with data from the array, where
|
|
||||||
// each element is a 32-bit integer. Otherwise, *m* should specify the
|
|
||||||
// number of bits. *k* specifies the number of hashing functions.
|
|
||||||
function BloomFilter(m, k) {
|
|
||||||
var a;
|
|
||||||
if (typeof m !== "number") a = m, m = a.length * 32;
|
|
||||||
|
|
||||||
this.m = m;
|
|
||||||
this.k = k;
|
|
||||||
var n = Math.ceil(m / 32),
|
|
||||||
i = -1;
|
|
||||||
|
|
||||||
if (typedArrays) {
|
|
||||||
var kbytes = 1 << Math.ceil(Math.log(Math.ceil(Math.log(m) / Math.LN2 / 8)) / Math.LN2),
|
|
||||||
array = kbytes === 1 ? Uint8Array : kbytes === 2 ? Uint16Array : Uint32Array,
|
|
||||||
kbuffer = new ArrayBuffer(kbytes * k),
|
|
||||||
buckets = this.buckets = new Int32Array(n);
|
|
||||||
if (a) while (++i < n) buckets[i] = a[i];
|
|
||||||
this._locations = new array(kbuffer);
|
|
||||||
} else {
|
|
||||||
var buckets = this.buckets = [];
|
|
||||||
if (a) while (++i < n) buckets[i] = a[i];
|
|
||||||
else while (++i < n) buckets[i] = 0;
|
|
||||||
this._locations = [];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// See http://willwhim.wordpress.com/2011/09/03/producing-n-hash-functions-by-hashing-only-once/
|
|
||||||
BloomFilter.prototype.locations = function(v) {
|
|
||||||
var k = this.k,
|
|
||||||
m = this.m,
|
|
||||||
r = this._locations,
|
|
||||||
a = fnv_1a(v),
|
|
||||||
b = fnv_1a_b(a),
|
|
||||||
i = -1,
|
|
||||||
x = a % m;
|
|
||||||
while (++i < k) {
|
|
||||||
r[i] = x < 0 ? (x + m) : x;
|
|
||||||
x = (x + b) % m;
|
|
||||||
}
|
|
||||||
return r;
|
|
||||||
};
|
|
||||||
|
|
||||||
BloomFilter.prototype.add = function(v) {
|
|
||||||
var l = this.locations(v + ""),
|
|
||||||
i = -1,
|
|
||||||
k = this.k,
|
|
||||||
buckets = this.buckets;
|
|
||||||
while (++i < k) buckets[Math.floor(l[i] / 32)] |= 1 << (l[i] % 32);
|
|
||||||
};
|
|
||||||
|
|
||||||
BloomFilter.prototype.test = function(v) {
|
|
||||||
var l = this.locations(v + ""),
|
|
||||||
i = -1,
|
|
||||||
k = this.k,
|
|
||||||
b,
|
|
||||||
buckets = this.buckets;
|
|
||||||
while (++i < k) {
|
|
||||||
b = l[i];
|
|
||||||
if ((buckets[Math.floor(b / 32)] & (1 << (b % 32))) === 0) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return true;
|
|
||||||
};
|
|
||||||
|
|
||||||
// Estimated cardinality.
|
|
||||||
BloomFilter.prototype.size = function() {
|
|
||||||
var buckets = this.buckets,
|
|
||||||
bits = 0;
|
|
||||||
for (var i = 0, n = buckets.length; i < n; ++i) bits += popcnt(buckets[i]);
|
|
||||||
return -this.m * Math.log(1 - bits / this.m) / this.k;
|
|
||||||
};
|
|
||||||
|
|
||||||
// http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
|
|
||||||
function popcnt(v) {
|
|
||||||
v -= (v >> 1) & 0x55555555;
|
|
||||||
v = (v & 0x33333333) + ((v >> 2) & 0x33333333);
|
|
||||||
return ((v + (v >> 4) & 0xF0F0F0F) * 0x1010101) >> 24;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Fowler/Noll/Vo hashing.
|
|
||||||
function fnv_1a(v) {
|
|
||||||
var n = v.length,
|
|
||||||
a = 2166136261,
|
|
||||||
c,
|
|
||||||
d,
|
|
||||||
i = -1;
|
|
||||||
while (++i < n) {
|
|
||||||
c = v.charCodeAt(i);
|
|
||||||
if (d = c & 0xff000000) {
|
|
||||||
a ^= d >> 24;
|
|
||||||
a += (a << 1) + (a << 4) + (a << 7) + (a << 8) + (a << 24);
|
|
||||||
}
|
|
||||||
if (d = c & 0xff0000) {
|
|
||||||
a ^= d >> 16;
|
|
||||||
a += (a << 1) + (a << 4) + (a << 7) + (a << 8) + (a << 24);
|
|
||||||
}
|
|
||||||
if (d = c & 0xff00) {
|
|
||||||
a ^= d >> 8;
|
|
||||||
a += (a << 1) + (a << 4) + (a << 7) + (a << 8) + (a << 24);
|
|
||||||
}
|
|
||||||
a ^= c & 0xff;
|
|
||||||
a += (a << 1) + (a << 4) + (a << 7) + (a << 8) + (a << 24);
|
|
||||||
}
|
|
||||||
// From http://home.comcast.net/~bretm/hash/6.html
|
|
||||||
a += a << 13;
|
|
||||||
a ^= a >> 7;
|
|
||||||
a += a << 3;
|
|
||||||
a ^= a >> 17;
|
|
||||||
a += a << 5;
|
|
||||||
return a & 0xffffffff;
|
|
||||||
}
|
|
||||||
|
|
||||||
// One additional iteration of FNV, given a hash.
|
|
||||||
function fnv_1a_b(a) {
|
|
||||||
a += (a << 1) + (a << 4) + (a << 7) + (a << 8) + (a << 24);
|
|
||||||
a += a << 13;
|
|
||||||
a ^= a >> 7;
|
|
||||||
a += a << 3;
|
|
||||||
a ^= a >> 17;
|
|
||||||
a += a << 5;
|
|
||||||
return a & 0xffffffff;
|
|
||||||
}
|
|
||||||
})(typeof exports !== "undefined" ? exports : this);
|
|
Loading…
x
Reference in New Issue
Block a user