001/* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017package org.apache.commons.collections4.bloomfilter; 018 019import java.util.Objects; 020import java.util.function.IntPredicate; 021 022/** 023 * A Hasher that implements combinatorial hashing as described by 024 * <a href="https://www.eecs.harvard.edu/~michaelm/postscripts/tr-02-05.pdf">Krisch and Mitzenmacher</a> using the enhanced double hashing technique 025 * described in the wikipedia article <a href="https://en.wikipedia.org/wiki/Double_hashing#Enhanced_double_hashing">Double Hashing</a>. 026 * <p> 027 * Common use for this hasher is to generate bit indices from a byte array output of a hashing 028 * or MessageDigest algorithm.</p> 029 * 030 * <h2>Thoughts on the hasher input</h2> 031 * 032 * <p>Note that it is worse to create smaller numbers for the {@code initial} and {@code increment}. If the {@code initial} is smaller than 033 * the number of bits in a filter then hashing will start at the same point when the size increases; likewise the {@code increment} will be 034 * the same if it remains smaller than the number of bits in the filter and so the first few indices will be the same if the number of bits 035 * changes (but is still larger than the {@code increment}). In a worse case scenario with small {@code initial} and {@code increment} for 036 * all items, hashing may not create indices that fill the full region within a much larger filter. Imagine hashers created with {@code initial} 037 * and {@code increment} values less than 255 with a filter size of 30000 and number of hash functions as 5. Ignoring the 038 * tetrahedral addition (a maximum of 20 for k=5) the max index is 255 * 4 + 255 = 1275, this covers 4.25% of the filter. This also 039 * ignores the negative wrapping but the behavior is the same, some bits cannot be reached. 040 * </p><p> 041 * So this needs to be avoided as the filter probability assumptions will be void. If the {@code initial} and {@code increment} are larger 042 * than the number of bits then the modulus will create a 'random' position and increment within the size. 043 * </p> 044 * 045 * @since 4.5.0-M1 046 */ 047public class EnhancedDoubleHasher implements Hasher { 048 049 /** 050 * Convert bytes to big-endian long filling with zero bytes as necessary. 051 * 052 * @param byteArray the byte array to extract the values from. 053 * @param offset the offset to start extraction from. 054 * @param len the length of the extraction, may be longer than 8. 055 * @return 056 */ 057 private static long toLong(final byte[] byteArray, final int offset, final int len) { 058 long val = 0; 059 int shift = Long.SIZE; 060 final int end = offset + Math.min(len, Long.BYTES); 061 for (int i = offset; i < end; i++) { 062 shift -= Byte.SIZE; 063 val |= (long) (byteArray[i] & 0xFF) << shift; 064 } 065 return val; 066 } 067 068 /** 069 * The initial hash value. 070 */ 071 private final long initial; 072 073 /** 074 * The value to increment the hash value by. 075 */ 076 private final long increment; 077 078 /** 079 * Constructs the EnhancedDoubleHasher from a byte array. 080 * <p> 081 * This method simplifies the conversion from a Digest or hasher algorithm output 082 * to the two values used by the EnhancedDoubleHasher.</p> 083 * <p>The byte array is split in 2 and the first 8 bytes of each half are interpreted as a big-endian long value. 084 * Excess bytes are ignored. 085 * If there are fewer than 16 bytes the following conversions are made. 086 * </p> 087 * <ol> 088 * <li>If there is an odd number of bytes the excess byte is assigned to the increment value</li> 089 * <li>The bytes allotted are read in big-endian order any byte not populated is set to zero.</li> 090 * </ol> 091 * <p> 092 * This ensures that small arrays generate the largest possible increment and initial values. 093 * </p> 094 * 095 * @param buffer the buffer to extract the longs from. 096 * @throws IllegalArgumentException is buffer length is zero. 097 */ 098 public EnhancedDoubleHasher(final byte[] buffer) { 099 if (buffer.length == 0) { 100 throw new IllegalArgumentException("buffer length must be greater than 0"); 101 } 102 // divide by 2 103 final int segment = buffer.length / 2; 104 this.initial = toLong(buffer, 0, segment); 105 this.increment = toLong(buffer, segment, buffer.length - segment); 106 } 107 108 /** 109 * Constructs the EnhancedDoubleHasher from 2 longs. The long values will be interpreted as unsigned values. 110 * 111 * @param initial The initial value for the hasher. 112 * @param increment The value to increment the hash by on each iteration. 113 */ 114 public EnhancedDoubleHasher(final long initial, final long increment) { 115 this.initial = initial; 116 this.increment = increment; 117 } 118 119 /** 120 * Gets the increment value for the hash calculation. 121 * 122 * @return the increment value for the hash calculation. 123 */ 124 long getIncrement() { 125 return increment; 126 } 127 128 /** 129 * Gets the initial value for the hash calculation. 130 * 131 * @return the initial value for the hash calculation. 132 */ 133 long getInitial() { 134 return initial; 135 } 136 137 @Override 138 public IndexExtractor indices(final Shape shape) { 139 Objects.requireNonNull(shape, "shape"); 140 141 return new IndexExtractor() { 142 143 @Override 144 public int[] asIndexArray() { 145 final int[] result = new int[shape.getNumberOfHashFunctions()]; 146 final int[] idx = new int[1]; 147 148 // This method needs to return duplicate indices 149 150 processIndices(i -> { 151 result[idx[0]++] = i; 152 return true; 153 }); 154 return result; 155 } 156 157 @Override 158 public boolean processIndices(final IntPredicate consumer) { 159 Objects.requireNonNull(consumer, "consumer"); 160 final int bits = shape.getNumberOfBits(); 161 // Enhanced double hashing: 162 // hash[i] = ( h1(x) + i*h2(x) + (i*i*i - i)/6 ) mod bits 163 // See: https://en.wikipedia.org/wiki/Double_hashing#Enhanced_double_hashing 164 // 165 // Essentially this is computing a wrapped modulus from a start point and an 166 // increment and an additional term as a tetrahedral number. 167 // You only need two modulus operations before the loop. Within the loop 168 // the modulus is handled using the sign bit to detect wrapping to ensure: 169 // 0 <= index < bits 170 // 0 <= inc < bits 171 // The final hash is: 172 // hash[i] = ( h1(x) - i*h2(x) - (i*i*i - i)/6 ) wrapped in [0, bits) 173 174 int index = BitMaps.mod(initial, bits); 175 if (!consumer.test(index)) { 176 return false; 177 } 178 int inc = BitMaps.mod(increment, bits); 179 180 final int k = shape.getNumberOfHashFunctions(); 181 182 if (k >= bits) { 183 // the tetraheadral incrementer. We need to ensure that this 184 // number does not exceed bits-1 or we may end up with an index > bits. 185 int tet = 1; 186 for (int i = 1; i < k; i++) { 187 // Update index and handle wrapping 188 index -= inc; 189 index = index < 0 ? index + bits : index; 190 if (!consumer.test(index)) { 191 return false; 192 } 193 194 // Incorporate the counter into the increment to create a 195 // tetrahedral number additional term, and handle wrapping. 196 inc -= tet; 197 inc = inc < 0 ? inc + bits : inc; 198 if (++tet == bits) { 199 tet = 0; 200 } 201 } 202 } else { 203 for (int i = 1; i < k; i++) { 204 // Update index and handle wrapping 205 index -= inc; 206 index = index < 0 ? index + bits : index; 207 if (!consumer.test(index)) { 208 return false; 209 } 210 211 // Incorporate the counter into the increment to create a 212 // tetrahedral number additional term, and handle wrapping. 213 inc -= i; 214 inc = inc < 0 ? inc + bits : inc; 215 } 216 217 } 218 return true; 219 } 220 }; 221 } 222}