1

我正在尝试为自定义哈希图(链表数组)编写代码,它可以存储 5 亿个值(键是链表数组号)并且可以将索引保存到磁盘。代码如下:

4

1 回答 1

2

这就是我将如何实现它

import java.io.File;
import java.io.IOException;
import java.nio.*;
import java.nio.channels.FileChannel;
import java.io.RandomAccessFile;
import java.util.Arrays;

class LongIntParallelHashMultimap {
    public static final int BUFFER_BITS = 24;
    private final FileChannel fc;
    private final ByteBuffer[] keys, data;
    private final int topBits, topMask, offsetMask;

    public LongIntParallelHashMultimap(String fileName, int sizeBits, boolean load) throws IOException {
        fc = new RandomAccessFile(fileName, "rw").getChannel();
        long totalSize = 4L << sizeBits;
        int bufferIndex = (int) (totalSize >> BUFFER_BITS);
        keys = new ByteBuffer[bufferIndex];
        data = new ByteBuffer[bufferIndex];
        int bufferSize = 1 << BUFFER_BITS;
        long offset = 0;
        for (int i = 0; i < bufferIndex; i++) {
            MappedByteBuffer kmap = fc.map(FileChannel.MapMode.READ_WRITE, offset, bufferSize);
            kmap.load();
            keys[i] = kmap.order(ByteOrder.nativeOrder());
            MappedByteBuffer dmap = fc.map(FileChannel.MapMode.READ_WRITE, offset + bufferSize, bufferSize);
            dmap.load();
            data[i] = dmap.order(ByteOrder.nativeOrder());
            offset += bufferSize * 2;
        }
        topBits = sizeBits + 2 - BUFFER_BITS;
        topMask = (1 << topBits) - 1;
        offsetMask = bufferSize - 4;
    }

    public void put(int key, int value) {
        int buffer = key & topMask;
        int key2 = (key >> topBits) + 1;
        assert key2 != 0;
        ByteBuffer keys2 = keys[buffer];
        ByteBuffer data2 = data[buffer];
        int offset = (key2 * 101) & offsetMask;
        while (keys2.getInt(offset) != 0) {
            offset += 3 * 4;
            offset &= offsetMask;
        }
        keys2.putInt(offset, key2);
        data2.putInt(offset, value);
    }

    public int get(int key, int[] values) {
        int buffer = key & topMask;
        int key2 = (key >> topBits) + 1;
        assert key2 != 0;
        ByteBuffer keys2 = keys[buffer];
        ByteBuffer data2 = data[buffer];
        int offset = (key2 * 101) & offsetMask;
        for (int count = 0; count < values.length; ) {
            int key3 = keys2.getInt(offset);
            if (key3 == 0)
                return count;
            if (key3 == key2)
                values[count++] = data2.getInt(offset);

            offset += 3 * 4;
            offset &= offsetMask;
        }
        return values.length;
    }

    private final int[] getValues = new int[1000];
    private static final int[] NO_VALUES = { };

    public int[] get(int key) {
        int len = get(key, getValues);
        return len == 0 ? NO_VALUES : Arrays.copyOf(getValues, len);
    }

    public static void main(String... args) throws IOException {
        int keys = 500 * 1000 * 1000;

        new File("abc.bin").delete();
        long startTime = System.nanoTime();
        LongIntParallelHashMultimap lph = new LongIntParallelHashMultimap("abc.bin", 30, true);
        long time = System.nanoTime() - startTime;
        System.out.printf("Load time was %.3f sec%n", time / 1e9);

        timePut(keys, lph);

        timeGet(keys, lph);

        timeGet2(keys, lph);

        startTime = System.nanoTime();
        System.gc();
        time = System.nanoTime() - startTime;
        System.out.printf("Time to Full GC was %.3f sec%n", time / 1e9);
    }

    private static void timePut(int keys, LongIntParallelHashMultimap lph) {
        long startTime;
        long time;
        startTime = System.nanoTime();
        for (int i = 0; i < keys; i++) {
            lph.put(i, i + 100);
            if ((i + 1) % 100_000_000 == 0)
                System.out.printf("%,d ", i + 1);
        }
        time = System.nanoTime() - startTime;
        System.out.printf("%nput time was %.3f sec%n", time / 1e9);
    }

    private static void timeGet(int keys, LongIntParallelHashMultimap lph) {
        long startTime;
        long time;
        startTime = System.nanoTime();
        int[] values = new int[2];
        for (int i = 0; i < keys; i++) {
            lph.get(i, values);
            if ((i + 1) % 100_000_000 == 0)
                System.out.printf("%,d ", i + 1);
        }
        time = System.nanoTime() - startTime;
        System.out.printf("%nget(key, values) time was %.3f sec%n", time / 1e9);
    }

    private static void timeGet2(int keys, LongIntParallelHashMultimap lph) {
        long startTime;
        long time;
        startTime = System.nanoTime();
        for (int i = 0; i < keys; i++) {
            lph.get(i);
            if ((i + 1) % 100_000_000 == 0)
                System.out.printf("%,d ", i + 1);
        }
        time = System.nanoTime() - startTime;
        System.out.printf("%nget(key) time was %.3f sec%n", time / 1e9);
    }
}

注意:没有额外的步骤来加载或保存数据。这意味着当你从put(key, value)数据返回时,即使你的程序崩溃了(假设你的操作系统没有崩溃)

这增加了 500m 个条目。运行-ea -verbosegc -mx16m最大堆为 16 MB。

Load time was 0.121 sec
100,000,000 200,000,000 300,000,000 400,000,000 500,000,000 
put time was 47.017 sec
100,000,000 200,000,000 300,000,000 400,000,000 500,000,000 
get(key, values) time was 50.858 sec
[ GC goes bananas for get(key) ]
100,000,000 200,000,000 300,000,000 400,000,000 500,000,000 
get(key) time was 87.634 sec
Time to Full GC was 0.015 sec

注意:它只是 GCed,因为我调用System.gc();并查看使用的堆的大小!


我会使用 aint[]而不是节点的链接列表。

import sun.nio.ch.DirectBuffer;

import java.io.IOException;
import java.util.*;
import java.nio.*;
import java.nio.channels.FileChannel;
import java.io.RandomAccessFile;

class LongIntParallelHashMultimap {
    private static final int[] NO_INTS = {};

    final int[][] data;

    public LongIntParallelHashMultimap() {
        data = new int[Integer.MAX_VALUE][];
    }

    public void put(int key, int value) {
        int[] ints = data[key];
        if (ints == null) {
            data[key] = new int[]{value};
        } else {
            int[] ints2 = Arrays.copyOf(ints, ints.length + 1);
            ints2[ints.length] = value;
            data[key] = ints2;
        }
    }

    public int[] get(int key) {
        int[] ints = data[key];
        return ints == null ? NO_INTS : ints;
    }

    private FileChannel channel;
    private MappedByteBuffer mbb;

    public void save() throws IOException {
        channel = new RandomAccessFile("abc.bin", "rw").getChannel();
        mbb = channel.map(FileChannel.MapMode.READ_WRITE, 0, 1 << 24);
        mbb.order(ByteOrder.nativeOrder());

        for (int i = 0; i < Integer.MAX_VALUE - 32; i += 32) {
            int bits = 0;
            for (int j = 0; j < 32; j++) {
                if (data[i + j] != null) bits |= 1;
                bits <<= 1;
            }
            getMbb().putInt(bits);
        }

        for (int i = 0; i < Integer.MAX_VALUE; i++) {
            int arr[] = get(i);
            if (arr.length == 0) continue;
            getMbb().putInt(arr.length);
            for (int a : arr)
                getMbb().putInt(a);
        }
        channel.close();
        cleanMbb();
    }

    private ByteBuffer getMbb() throws IOException {
        if (mbb.remaining() <= 0) {
            cleanMbb();
            mbb = channel.map(FileChannel.MapMode.READ_WRITE, channel.size(), 1 << 24);
            mbb.order(ByteOrder.nativeOrder());
        }
        return mbb;
    }

    private void cleanMbb() {
        ((DirectBuffer) mbb).cleaner().clean();
    }


    public static void main(String... args) throws IOException {
        int keys = 50 * 1000 * 1000;

        long startTime = System.nanoTime();
        LongIntParallelHashMultimap lph = new LongIntParallelHashMultimap();
        long time = System.nanoTime() - startTime;
        System.out.printf("Create time %.3f sec%n", time / 1e9);

        startTime = System.nanoTime();
        for (int i = 0; i < keys; i++) {
            lph.put(i, i + 100);
            if (i % 10000000 == 0 && i != 0)
                System.out.print(" " + i + " ");
        }
        time = System.nanoTime() - startTime;
        System.out.printf("%nput time was %.3f sec%n", time / 1e9);

        startTime = System.nanoTime();
        lph.save();
        time = System.nanoTime() - startTime;
        System.out.printf(" time to save was %.3f sec%n", time / 1e9);

        startTime = System.nanoTime();
        for (int i = 0; i < keys; i++) {
            int k[] = lph.get(i);
        }
        time = System.nanoTime() - startTime;
        System.out.printf("get time was %.3f sec%n", time / 1e9);
    }
}
于 2012-08-03T14:28:09.667 回答