I am trying to write code for a custom hashmap (an array of linked lists) that can store 500 million values (the key is the slot number in the array of linked lists) and that can save the index to disk.
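(For reference, the "array of linked lists" structure the question describes, with the key used as the array index, might look roughly like the sketch below. This is only an illustration of the idea, not the asker's actual code.)

import java.util.LinkedList;

// Illustration only: one bucket per key, each bucket a linked list of values.
// At 500 million values the per-node and boxing overhead of this layout is
// what makes it hard to fit in memory, which is roughly what the answer below
// avoids by keeping the data off-heap or in plain int[]s.
class NaiveListArrayMap {
    private final LinkedList<Integer>[] buckets;

    @SuppressWarnings("unchecked")
    NaiveListArrayMap(int size) {
        buckets = new LinkedList[size];
    }

    void put(int key, int value) {
        if (buckets[key] == null)
            buckets[key] = new LinkedList<>();
        buckets[key].add(value);
    }

    LinkedList<Integer> get(int key) {
        return buckets[key];   // may be null if no values were stored for the key
    }
}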
This is how I would implement it:
import java.io.File;
import java.io.IOException;
import java.nio.*;
import java.nio.channels.FileChannel;
import java.io.RandomAccessFile;
import java.util.Arrays;
class LongIntParallelHashMultimap {
    public static final int BUFFER_BITS = 24;

    private final FileChannel fc;
    private final ByteBuffer[] keys, data;
    private final int topBits, topMask, offsetMask;

    public LongIntParallelHashMultimap(String fileName, int sizeBits, boolean load) throws IOException {
        fc = new RandomAccessFile(fileName, "rw").getChannel();
        long totalSize = 4L << sizeBits;
        int bufferIndex = (int) (totalSize >> BUFFER_BITS);
        keys = new ByteBuffer[bufferIndex];
        data = new ByteBuffer[bufferIndex];
        int bufferSize = 1 << BUFFER_BITS;
        long offset = 0;
        // map the file as interleaved 16 MB regions: keys[0], data[0], keys[1], data[1], ...
        for (int i = 0; i < bufferIndex; i++) {
            MappedByteBuffer kmap = fc.map(FileChannel.MapMode.READ_WRITE, offset, bufferSize);
            kmap.load();
            keys[i] = kmap.order(ByteOrder.nativeOrder());
            MappedByteBuffer dmap = fc.map(FileChannel.MapMode.READ_WRITE, offset + bufferSize, bufferSize);
            dmap.load();
            data[i] = dmap.order(ByteOrder.nativeOrder());
            offset += bufferSize * 2;
        }
        topBits = sizeBits + 2 - BUFFER_BITS;
        topMask = (1 << topBits) - 1;
        offsetMask = bufferSize - 4;
    }

    public void put(int key, int value) {
        int buffer = key & topMask;        // low bits select the buffer pair
        int key2 = (key >> topBits) + 1;   // remaining high bits, plus 1 so that 0 means "empty slot"
        assert key2 != 0;
        ByteBuffer keys2 = keys[buffer];
        ByteBuffer data2 = data[buffer];
        int offset = (key2 * 101) & offsetMask;
        // open addressing: probe until a free (zero) slot is found
        while (keys2.getInt(offset) != 0) {
            offset += 3 * 4;
            offset &= offsetMask;
        }
        keys2.putInt(offset, key2);
        data2.putInt(offset, value);
    }

    public int get(int key, int[] values) {
        int buffer = key & topMask;
        int key2 = (key >> topBits) + 1;
        assert key2 != 0;
        ByteBuffer keys2 = keys[buffer];
        ByteBuffer data2 = data[buffer];
        int offset = (key2 * 101) & offsetMask;
        for (int count = 0; count < values.length; ) {
            int key3 = keys2.getInt(offset);
            if (key3 == 0)
                return count;
            if (key3 == key2)
                values[count++] = data2.getInt(offset);
            offset += 3 * 4;
            offset &= offsetMask;
        }
        return values.length;
    }

    private final int[] getValues = new int[1000];
    private static final int[] NO_VALUES = { };

    public int[] get(int key) {
        int len = get(key, getValues);
        return len == 0 ? NO_VALUES : Arrays.copyOf(getValues, len);
    }

    public static void main(String... args) throws IOException {
        int keys = 500 * 1000 * 1000;
        new File("abc.bin").delete();
        long startTime = System.nanoTime();
        LongIntParallelHashMultimap lph = new LongIntParallelHashMultimap("abc.bin", 30, true);
        long time = System.nanoTime() - startTime;
        System.out.printf("Load time was %.3f sec%n", time / 1e9);

        timePut(keys, lph);
        timeGet(keys, lph);
        timeGet2(keys, lph);

        startTime = System.nanoTime();
        System.gc();
        time = System.nanoTime() - startTime;
        System.out.printf("Time to Full GC was %.3f sec%n", time / 1e9);
    }

    private static void timePut(int keys, LongIntParallelHashMultimap lph) {
        long startTime;
        long time;
        startTime = System.nanoTime();
        for (int i = 0; i < keys; i++) {
            lph.put(i, i + 100);
            if ((i + 1) % 100_000_000 == 0)
                System.out.printf("%,d ", i + 1);
        }
        time = System.nanoTime() - startTime;
        System.out.printf("%nput time was %.3f sec%n", time / 1e9);
    }

    private static void timeGet(int keys, LongIntParallelHashMultimap lph) {
        long startTime;
        long time;
        startTime = System.nanoTime();
        int[] values = new int[2];
        for (int i = 0; i < keys; i++) {
            lph.get(i, values);
            if ((i + 1) % 100_000_000 == 0)
                System.out.printf("%,d ", i + 1);
        }
        time = System.nanoTime() - startTime;
        System.out.printf("%nget(key, values) time was %.3f sec%n", time / 1e9);
    }

    private static void timeGet2(int keys, LongIntParallelHashMultimap lph) {
        long startTime;
        long time;
        startTime = System.nanoTime();
        for (int i = 0; i < keys; i++) {
            lph.get(i);
            if ((i + 1) % 100_000_000 == 0)
                System.out.printf("%,d ", i + 1);
        }
        time = System.nanoTime() - startTime;
        System.out.printf("%nget(key) time was %.3f sec%n", time / 1e9);
    }
}
Note: there is no extra step to load or save the data. This means that by the time put(key, value) returns, the data has been persisted, even if your program crashes (assuming your OS does not crash); a short read-back sketch is given further below. The test adds 500M entries and was run with -ea -verbosegc -mx16m, i.e. with a maximum heap of only 16 MB.
Load time was 0.121 sec
100,000,000 200,000,000 300,000,000 400,000,000 500,000,000
put time was 47.017 sec
100,000,000 200,000,000 300,000,000 400,000,000 500,000,000
get(key, values) time was 50.858 sec
[ GC goes bananas for get(key) ]
100,000,000 200,000,000 300,000,000 400,000,000 500,000,000
get(key) time was 87.634 sec
Time to Full GC was 0.015 sec
Note: it only GCed because I called System.gc() explicitly, to check how much heap was actually in use.
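To make the point about persistence concrete: a later run can simply reopen the same file and read values straight back, because the buffers are memory-mapped onto abc.bin. A minimal sketch (assuming abc.bin was produced by the run above with sizeBits = 30, and using the class and imports shown above):

// Reopen the existing file; no explicit load step is needed because the
// buffers are memory-mapped directly onto abc.bin.
LongIntParallelHashMultimap reopened =
        new LongIntParallelHashMultimap("abc.bin", 30, true);
// The benchmark stored put(i, i + 100), so for example key 12345 reads back as [12445].
System.out.println(Arrays.toString(reopened.get(12345)));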
I would use an int[] rather than a linked list of nodes.
import sun.nio.ch.DirectBuffer;
import java.io.IOException;
import java.util.*;
import java.nio.*;
import java.nio.channels.FileChannel;
import java.io.RandomAccessFile;
class LongIntParallelHashMultimap {
    private static final int[] NO_INTS = {};

    final int[][] data;

    public LongIntParallelHashMultimap() {
        // Integer.MAX_VALUE itself exceeds the VM's maximum array length,
        // so back off by a few slots; this still needs a very large heap.
        data = new int[Integer.MAX_VALUE - 8][];
    }

    public void put(int key, int value) {
        int[] ints = data[key];
        if (ints == null) {
            data[key] = new int[]{value};
        } else {
            // append by copying; the arrays per key are expected to stay small
            int[] ints2 = Arrays.copyOf(ints, ints.length + 1);
            ints2[ints.length] = value;
            data[key] = ints2;
        }
    }

    public int[] get(int key) {
        int[] ints = data[key];
        return ints == null ? NO_INTS : ints;
    }

    private FileChannel channel;
    private MappedByteBuffer mbb;

    public void save() throws IOException {
        channel = new RandomAccessFile("abc.bin", "rw").getChannel();
        mbb = channel.map(FileChannel.MapMode.READ_WRITE, 0, 1 << 24);
        mbb.order(ByteOrder.nativeOrder());
        // first section: a bitmap of which slots are non-empty, 32 slots per int,
        // most significant bit first
        for (int i = 0; i + 32 <= data.length; i += 32) {
            int bits = 0;
            for (int j = 0; j < 32; j++) {
                bits <<= 1;
                if (data[i + j] != null)
                    bits |= 1;
            }
            getMbb().putInt(bits);
        }
        // second section: for every non-empty slot, the array length followed by its values
        for (int i = 0; i < data.length; i++) {
            int[] arr = get(i);
            if (arr.length == 0) continue;
            getMbb().putInt(arr.length);
            for (int a : arr)
                getMbb().putInt(a);
        }
        channel.close();
        cleanMbb();
    }

    private ByteBuffer getMbb() throws IOException {
        // map the file 16 MB at a time, extending it as each region fills up
        if (mbb.remaining() <= 0) {
            cleanMbb();
            mbb = channel.map(FileChannel.MapMode.READ_WRITE, channel.size(), 1 << 24);
            mbb.order(ByteOrder.nativeOrder());
        }
        return mbb;
    }

    private void cleanMbb() {
        // internal JDK API used to unmap the buffer eagerly
        ((DirectBuffer) mbb).cleaner().clean();
    }

    public static void main(String... args) throws IOException {
        int keys = 50 * 1000 * 1000;
        long startTime = System.nanoTime();
        LongIntParallelHashMultimap lph = new LongIntParallelHashMultimap();
        long time = System.nanoTime() - startTime;
        System.out.printf("Create time %.3f sec%n", time / 1e9);

        startTime = System.nanoTime();
        for (int i = 0; i < keys; i++) {
            lph.put(i, i + 100);
            if (i % 10000000 == 0 && i != 0)
                System.out.print(" " + i + " ");
        }
        time = System.nanoTime() - startTime;
        System.out.printf("%nput time was %.3f sec%n", time / 1e9);

        startTime = System.nanoTime();
        lph.save();
        time = System.nanoTime() - startTime;
        System.out.printf(" time to save was %.3f sec%n", time / 1e9);

        startTime = System.nanoTime();
        for (int i = 0; i < keys; i++) {
            int[] k = lph.get(i);
        }
        time = System.nanoTime() - startTime;
        System.out.printf("get time was %.3f sec%n", time / 1e9);
    }
}
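The second version only shows save(); a load() counterpart is not part of the answer, but under the on-disk layout save() writes (a bitmap of non-empty slots in groups of 32, most significant bit first, followed by a length-prefixed int[] for every slot whose bit is set) it could look roughly like the sketch below. It assumes the whole file fits in a single mapping of under 2 GB and that the method is added to the class above, so the existing imports apply.

// Sketch of a possible load() counterpart (not part of the original answer).
public void load(String fileName) throws IOException {
    try (FileChannel ch = new RandomAccessFile(fileName, "r").getChannel()) {
        ByteBuffer in = ch.map(FileChannel.MapMode.READ_ONLY, 0, ch.size())
                          .order(ByteOrder.nativeOrder());
        // first section: the bitmap, one int per 32 slots, most significant bit first
        int groups = data.length / 32;
        int[] bitmap = new int[groups];
        for (int g = 0; g < groups; g++)
            bitmap[g] = in.getInt();
        // second section: a length-prefixed int[] for every slot whose bit is set;
        // the last partial group (fewer than 32 slots) is not covered, matching save()
        for (int g = 0; g < groups; g++) {
            int bits = bitmap[g];
            if (bits == 0) continue;                      // whole group is empty
            for (int j = 0; j < 32; j++) {
                if (((bits >>> (31 - j)) & 1) == 0) continue;
                int len = in.getInt();
                int[] arr = new int[len];
                for (int k = 0; k < len; k++)
                    arr[k] = in.getInt();
                data[g * 32 + j] = arr;
            }
        }
    }
}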