
Java SimHash 是什麼?讓我們一起來了解一下吧!
Java SimHash 是 Java 程序中的一種算法。SimHash 算法產生於 2002 年,設計非常美妙:它的輸入是一個向量,得出的結果是一個 f 位的簽名值。

Simhash和一般的hash算法不同,它具有兩個關鍵的特點:
1.一個文檔的指紋是所有屬性的某種hash;
2.相似文檔的hash應該是相似的;
simhash 算法如下:
1. 將一個 f 維的向量 V 初始化為 0;f 位的二進制數 S 初始化為 0;
2. 對每一個特徵:用傳統的 hash 算法對該特徵產生一個 f 位的簽名 b。對 i = 1 到 f:如果 b 的第 i 位為 1,則 V 的第 i 個元素加上該特徵的權重;否則,V 的第 i 個元素減去該特徵的權重;
3. 如果 V 的第 i 個元素大於 0,則 S 的第 i 位為 1,否則為 0;
4. 輸出 S 作為簽名。
simhash 算法代碼:
package?com.xxxx.checkandbigdataquery.utils;
?
import?it.unimi.dsi.fastutil.longs.LongOpenHashSet;
import?it.unimi.dsi.fastutil.longs.LongSet;
import?java.io.File;
import?java.io.FileInputStream;
import?java.io.IOException;
import?java.nio.CharBuffer;
import?java.util.Set;
?
?
/**
?*?a?basic?SimHash?implementation
?*
?*
?*/
public?class?SimHash?{
??public?static?final?int??HASH_SIZE??????????=?64;
??public?static?final?long?HASH_RANGE?????????=?2?^?HASH_SIZE;
??public?static?MurmurHash?hasher?????????????=?new?MurmurHash();
?
??/**
???*?use?short?cuts?to?obtains?a?speed?optimized?simhash?calculation
???*
???*?@param?s
???*??????????input?string
???*?@return?64?bit?simhash?of?input?string
???*/
?
??private?static?final?int?FIXED_CGRAM_LENGTH?=?4;
?
??public?static?long?computeOptimizedSimHashForString(String?s)?{
????return?computeOptimizedSimHashForString(CharBuffer.wrap(s));
??}
?
??public?static?long?computeOptimizedSimHashForString(CharBuffer?s)?{
?
????LongSet?shingles?=?new?LongOpenHashSet(Math.min(s.length(),?100000));
?
????int?length?=?s.length();
?
????long?timeStart?=?System.currentTimeMillis();
????for?(int?i?=?0;?i??56);
??????longAsBytes[1]?=?(byte)?(shingle?>>?48);
??????longAsBytes[2]?=?(byte)?(shingle?>>?40);
??????longAsBytes[3]?=?(byte)?(shingle?>>?32);
??????longAsBytes[4]?=?(byte)?(shingle?>>?24);
??????longAsBytes[5]?=?(byte)?(shingle?>>?16);
??????longAsBytes[6]?=?(byte)?(shingle?>>?8);
??????longAsBytes[7]?=?(byte)?(shingle);
?
??????long?longHash?=?FPGenerator.std64.fp(longAsBytes,?0,?8);
??????for?(int?i?=?0;?i?>?i)?&?1L)?==?1L;
????????v[i]?+=?(bitSet)???1?:?-1;
??????}
????}
?
????long?simhash?=?0;
????for?(int?i?=?0;?i??0)?{
????????simhash?|=?(1L?<>?i)?&?1L)?==?1L;
????????v[i]?+=?(bitSet)???1?:?-1;
??????}
????}
????long?simhash?=?0;
????for?(int?i?=?0;?i??0)?{
????????simhash?|=?(1L?<>>?-1);
??}
?
??public?static?void?main(String[]?args)?{
????try?{
??????//?File?file1?=?new?File("/Users/rana/academia.edu_01.html");
??????//?File?file2?=?new?File("/Users/rana/academia.edu_02.html");
?
??????File?file1?=?new?File(args[0]);
??????File?file2?=?new?File(args[1]);
?
??????byte?data1[]?=?new?byte[(int)?file1.length()];
??????byte?data2[]?=?new?byte[(int)?file2.length()];
??????FileInputStream?stream1?=?new?FileInputStream(file1);
??????FileInputStream?stream2?=?new?FileInputStream(file2);
??????stream1.read(data1);
??????stream2.read(data2);
??????String?string1?=?new?String(data1);
??????String?string2?=?new?String(data2);
?
??????long?timeStart?=?System.currentTimeMillis();
??????long?simhash1?=?computeSimHashFromString(Shingle.shingles(string1));
??????long?timeEnd?=?System.currentTimeMillis();
??????System.out.println("Old?Calc?for?Document?A?Took:"
??????????+?(timeEnd?-?timeStart));
??????timeStart?=?System.currentTimeMillis();
??????long?simhash2?=?computeSimHashFromString(Shingle.shingles(string2));
??????timeEnd?=?System.currentTimeMillis();
??????System.out.println("Old?Calc?for?Document?B?Took:"
??????????+?(timeEnd?-?timeStart));
??????timeStart?=?System.currentTimeMillis();
??????long?simhash3?=?computeOptimizedSimHashForString(string1);
??????timeEnd?=?System.currentTimeMillis();
??????System.out.println("New?Calc?for?Document?A?Took:"
??????????+?(timeEnd?-?timeStart));
??????timeStart?=?System.currentTimeMillis();
??????long?simhash4?=?computeOptimizedSimHashForString(string2);
??????timeEnd?=?System.currentTimeMillis();
??????System.out.println("New?Calc?for?Document?B?Took:"
??????????+?(timeEnd?-?timeStart));
?
??????int?hammingDistance?=?hammingDistance(simhash1,?simhash2);
??????int?hammingDistance2?=?hammingDistance(simhash3,?simhash4);
?
??????System.out.println("hammingdistance?Doc?(A)?to?Doc(B)?OldWay:"
??????????+?hammingDistance);
??????System.out.println("hammingdistance?Doc?(A)?to?Doc(B)?NewWay:"
??????????+?hammingDistance2);
????}?catch?(IOException?e)?{
??????e.printStackTrace();
????}
??}
}
以上就是小編今天的分享了,希望可以幫助到大家。
