00001
00002
00003
00004
00005
00006
00007
00008 #include "ibrcommon/data/BloomFilter.h"
00009 #include "ibrcommon/Exceptions.h"
00010 #include <limits>
00011 #include <math.h>
00012
00013 namespace ibrcommon
00014 {
00015 DefaultHashProvider::DefaultHashProvider(size_t salt_count)
00016 : salt_count_(salt_count)
00017 {
00018 generate_salt();
00019 }
00020
00021 DefaultHashProvider::~DefaultHashProvider()
00022 {
00023 }
00024
00025 bool DefaultHashProvider::operator==(const DefaultHashProvider &provider) const
00026 {
00027 return ( provider.salt_count_ == salt_count_);
00028 }
00029
00030 void DefaultHashProvider::clear()
00031 {
00032 _salt.clear();
00033 }
00034
00035 void DefaultHashProvider::add(bloom_type hash)
00036 {
00037 _salt.push_back(hash);
00038 }
00039
00040 size_t DefaultHashProvider::count() const
00041 {
00042 return _salt.size();
00043 }
00044
00045 const std::list<bloom_type> DefaultHashProvider::hash(const unsigned char* begin, std::size_t remaining_length) const
00046 {
00047 std::list<bloom_type> hashes;
00048
00049 for (std::vector<bloom_type>::const_iterator iter = _salt.begin(); iter != _salt.end(); iter++)
00050 {
00051 hashes.push_back(hash_ap(begin, remaining_length, (*iter)));
00052 }
00053
00054 return hashes;
00055 }
00056
00057
00058 void DefaultHashProvider::generate_salt()
00059 {
00060 const unsigned int predef_salt_count = 64;
00061 static const bloom_type predef_salt[predef_salt_count] =
00062 {
00063 0xAAAAAAAA, 0x55555555, 0x33333333, 0xCCCCCCCC,
00064 0x66666666, 0x99999999, 0xB5B5B5B5, 0x4B4B4B4B,
00065 0xAA55AA55, 0x55335533, 0x33CC33CC, 0xCC66CC66,
00066 0x66996699, 0x99B599B5, 0xB54BB54B, 0x4BAA4BAA,
00067 0xAA33AA33, 0x55CC55CC, 0x33663366, 0xCC99CC99,
00068 0x66B566B5, 0x994B994B, 0xB5AAB5AA, 0xAAAAAA33,
00069 0x555555CC, 0x33333366, 0xCCCCCC99, 0x666666B5,
00070 0x9999994B, 0xB5B5B5AA, 0xFFFFFFFF, 0xFFFF0000,
00071 0xB823D5EB, 0xC1191CDF, 0xF623AEB3, 0xDB58499F,
00072 0xC8D42E70, 0xB173F616, 0xA91A5967, 0xDA427D63,
00073 0xB1E8A2EA, 0xF6C0D155, 0x4909FEA3, 0xA68CC6A7,
00074 0xC395E782, 0xA26057EB, 0x0CD5DA28, 0x467C5492,
00075 0xF15E6982, 0x61C6FAD3, 0x9615E352, 0x6E9E355A,
00076 0x689B563E, 0x0C9831A8, 0x6753C18B, 0xA622689B,
00077 0x8CA63C47, 0x42CC2884, 0x8E89919B, 0x6EDBD7D3,
00078 0x15B6796C, 0x1D6FDFE4, 0x63FF9092, 0xE7401432
00079 };
00080
00081 if (salt_count_ > predef_salt_count)
00082 {
00083 throw ibrcommon::Exception("Max. 64 hash salts supported!");
00084 }
00085
00086 for(unsigned int i = 0; i < salt_count_; ++i)
00087 {
00088 add(predef_salt[i]);
00089 }
00090 }
00091
00092 bloom_type DefaultHashProvider::hash_ap(const unsigned char* begin, std::size_t remaining_length, bloom_type hash) const
00093 {
00094 const unsigned char* it = begin;
00095 while(remaining_length >= 2)
00096 {
00097 hash ^= (hash << 7) ^ (*it++) * (hash >> 3);
00098 hash ^= (~((hash << 11) + ((*it++) ^ (hash >> 5))));
00099 remaining_length -= 2;
00100 }
00101 if (remaining_length)
00102 {
00103 hash ^= (hash << 7) ^ (*it) * (hash >> 3);
00104 }
00105 return hash;
00106 }
00107
00108 const unsigned char BloomFilter::bit_mask[bits_per_char] = {
00109 0x01,
00110 0x02,
00111 0x04,
00112 0x08,
00113 0x10,
00114 0x20,
00115 0x40,
00116 0x80
00117 };
00118
00119 BloomFilter::BloomFilter(std::size_t table_size, std::size_t salt_count)
00120 : _hashp(salt_count), table_size_(table_size), _itemcount(0), salt_count_(salt_count)
00121 {
00122 bit_table_ = new cell_type[table_size_ / bits_per_char];
00123 std::fill_n(bit_table_,(table_size_ / bits_per_char),0x00);
00124 }
00125
00126 BloomFilter::BloomFilter(const BloomFilter& filter)
00127 : _hashp(filter._hashp), table_size_(filter.table_size_), _itemcount(filter._itemcount), salt_count_(filter.salt_count_)
00128 {
00129 bit_table_ = new cell_type[table_size_ / bits_per_char];
00130 std::copy(filter.bit_table_, filter.bit_table_ + (table_size_ / bits_per_char), bit_table_);
00131 }
00132
00133 BloomFilter::~BloomFilter()
00134 {
00135 delete[] bit_table_;
00136 }
00137
00138 BloomFilter& BloomFilter::operator=(const BloomFilter& filter)
00139 {
00140 _hashp = filter._hashp;
00141 table_size_ = filter.table_size_;
00142 delete[] bit_table_;
00143 bit_table_ = new cell_type[table_size_ / bits_per_char];
00144 std::copy(filter.bit_table_, filter.bit_table_ + (table_size_ / bits_per_char), bit_table_);
00145 return *this;
00146 }
00147
00148 bool BloomFilter::operator!() const
00149 {
00150 return (0 == table_size_);
00151 }
00152
00153 void BloomFilter::clear()
00154 {
00155 std::fill_n(bit_table_,(table_size_ / bits_per_char),0x00);
00156 _itemcount = 0;
00157 }
00158
00159 void BloomFilter::insert(const unsigned char* key_begin, const std::size_t length)
00160 {
00161 std::size_t bit_index = 0;
00162 std::size_t bit = 0;
00163
00164 std::list<bloom_type> hashes = _hashp.hash(key_begin, length);
00165
00166 for (std::list<bloom_type>::iterator iter = hashes.begin(); iter != hashes.end(); iter++)
00167 {
00168 compute_indices( (*iter), bit_index, bit );
00169 bit_table_[bit_index / bits_per_char] |= bit_mask[bit];
00170 }
00171
00172 if (_itemcount < std::numeric_limits<unsigned int>::max()) _itemcount++;
00173 }
00174
00175 void BloomFilter::insert(const std::string& key)
00176 {
00177 insert(reinterpret_cast<const unsigned char*>(key.c_str()),key.size());
00178 }
00179
00180 void BloomFilter::insert(const char* data, const std::size_t& length)
00181 {
00182 insert(reinterpret_cast<const unsigned char*>(data),length);
00183 }
00184
00185 bool BloomFilter::contains(const unsigned char* key_begin, const std::size_t length) const
00186 {
00187 std::size_t bit_index = 0;
00188 std::size_t bit = 0;
00189
00190 const std::list<bloom_type> hashes = _hashp.hash(key_begin, length);
00191
00192 for (std::list<bloom_type>::const_iterator iter = hashes.begin(); iter != hashes.end(); iter++)
00193 {
00194 compute_indices( (*iter), bit_index, bit );
00195 if ((bit_table_[bit_index / bits_per_char] & bit_mask[bit]) != bit_mask[bit])
00196 {
00197 return false;
00198 }
00199 }
00200
00201 return true;
00202 }
00203
00204 bool BloomFilter::contains(const std::string& key) const
00205 {
00206 return contains(reinterpret_cast<const unsigned char*>(key.c_str()),key.size());
00207 }
00208
00209 bool BloomFilter::contains(const char* data, const std::size_t& length) const
00210 {
00211 return contains(reinterpret_cast<const unsigned char*>(data),length);
00212 }
00213
00214 std::size_t BloomFilter::size() const
00215 {
00216 return (table_size_ / bits_per_char);
00217 }
00218
00219 BloomFilter& BloomFilter::operator &= (const BloomFilter& filter)
00220 {
00221
00222 if (
00223 (_hashp == filter._hashp) &&
00224 (table_size_ == filter.table_size_)
00225 )
00226 {
00227 for (std::size_t i = 0; i < (table_size_ / bits_per_char); ++i)
00228 {
00229 bit_table_[i] &= filter.bit_table_[i];
00230 }
00231 }
00232 return *this;
00233 }
00234
00235 BloomFilter& BloomFilter::operator |= (const BloomFilter& filter)
00236 {
00237
00238 if (
00239 (_hashp == filter._hashp) &&
00240 (table_size_ == filter.table_size_)
00241 )
00242 {
00243 for (std::size_t i = 0; i < (table_size_ / bits_per_char); ++i)
00244 {
00245 bit_table_[i] |= filter.bit_table_[i];
00246 }
00247 }
00248 return *this;
00249 }
00250
00251 BloomFilter& BloomFilter::operator ^= (const BloomFilter& filter)
00252 {
00253
00254 if (
00255 (_hashp == filter._hashp) &&
00256 (table_size_ == filter.table_size_)
00257 )
00258 {
00259 for (std::size_t i = 0; i < (table_size_ / bits_per_char); ++i)
00260 {
00261 bit_table_[i] ^= filter.bit_table_[i];
00262 }
00263 }
00264 return *this;
00265 }
00266
00267 void BloomFilter::load(const cell_type* data, size_t len)
00268 {
00269 if (len < table_size_)
00270 {
00271 std::copy(data, data + len, bit_table_);
00272 }
00273 else
00274 {
00275 std::copy(data, data + table_size_, bit_table_);
00276 }
00277
00278 _itemcount = 0;
00279 }
00280
00281 const cell_type* BloomFilter::table() const
00282 {
00283 return bit_table_;
00284 }
00285
00286 void BloomFilter::compute_indices(const bloom_type& hash, std::size_t& bit_index, std::size_t& bit) const
00287 {
00288 bit_index = hash % table_size_;
00289 bit = bit_index % bits_per_char;
00290 }
00291
00292 float BloomFilter::getAllocation() const
00293 {
00294 float m = table_size_;
00295 float n = _itemcount;
00296 float k = salt_count_;
00297
00298 return pow(1 - pow(1 - (1 / m), k * n), k);
00299 }
00300 }