1

I've programmed a (very simple) benchmark in Java. It simply increments a double value up to a specified value and takes the time.

When I use this singlethreaded or with a low amount of threads (up to 100) on my 6-core desktop, the benchmark returns reasonable and repeatable results.

But when I use for example 1200 threads, the average multicore duration is significantly lower than the singlecore duration (about 10 times or more). I've made sure that the total amount of incrementations is the same, no matter how much threads I use.

Why does the performance drop so much with more threads? Is there a trick to solve this problem?

I'm posting my source, but I don't think, that there is a problem.

Benchmark.java:

package sibbo.benchmark;

import java.text.DecimalFormat;
import java.util.LinkedList;
import java.util.List;

public class Benchmark implements TestFinishedListener {
            private static final double TARGET = 1e10;
    private static final int THREAD_MULTIPLICATOR = 2;

    public static void main(String[] args) throws InterruptedException {
        Benchmark b = new Benchmark(TARGET);
        b.start();
    }

    private int coreCount;
    private List<Worker> workers = new LinkedList<>();
    private List<Worker> finishedWorkers = new LinkedList<>();
    private double target;

    public Benchmark(double target) {
        this.target = target;
        getSystemInfos();
        printInfos();
    }

    private void getSystemInfos() {
        coreCount = Runtime.getRuntime().availableProcessors();
    }

    private void printInfos() {
        System.out.println("Usable cores: " + coreCount);
        System.out.println("Multicore threads: " + coreCount *                 THREAD_MULTIPLICATOR);
        System.out.println("Loops per core: " + new DecimalFormat("###,###,###,###,##0").format(TARGET));

        System.out.println();
    }

    public synchronized void start() throws InterruptedException {
        Thread.currentThread().setPriority(Thread.MAX_PRIORITY);

        System.out.print("Initializing singlecore benchmark... ");
        Worker w = new Worker(this, 0);
        workers.add(w);

        Thread.sleep(1000);
        System.out.println("finished");

        System.out.print("Running singlecore benchmark... ");
        w.runBenchmark(target);
        wait();

        System.out.println("finished");
        printResult();

        System.out.println();
        // Multicore
        System.out.print("Initializing multicore benchmark...  ");
        finishedWorkers.clear();

        for (int i = 0; i < coreCount * THREAD_MULTIPLICATOR; i++) {
            workers.add(new Worker(this, i));
        }

        Thread.sleep(1000);
        System.out.println("finished");

        System.out.print("Running multicore benchmark...  ");

        for (Worker worker : workers) {
            worker.runBenchmark(target / THREAD_MULTIPLICATOR);
        }

        wait();

        System.out.println("finished");
        printResult();

        Thread.currentThread().setPriority(Thread.NORM_PRIORITY);
    }

    private void printResult() {
        DecimalFormat df = new DecimalFormat("###,###,###,##0.000");

        long min = -1, av = 0, max = -1;
        int threadCount = 0;
        boolean once = true;

        System.out.println("Result:");

        for (Worker w : finishedWorkers) {
            if (once) {
                once = false;

                min = w.getTime();
                max = w.getTime();
            }

            if (w.getTime() > max) {
                max = w.getTime();
            }

            if (w.getTime() < min) {
                min = w.getTime();
            }

            threadCount++;
            av += w.getTime();

            if (finishedWorkers.size() <= 6) {
                System.out.println("Worker " + w.getId() + ": " + df.format(w.getTime() / 1e9) + "s");
            }
        }

        System.out.println("Min: " + df.format(min / 1e9) + "s, Max: " + df.format(max / 1e9) + "s, Av per Thread: "
                + df.format((double) av / threadCount / 1e9) + "s");
    }

    @Override
    public synchronized void testFinished(Worker w) {
        workers.remove(w);
        finishedWorkers.add(w);

        if (workers.isEmpty()) {
            notify();
        }
    }
}

Worker.java:

package sibbo.benchmark;

public class Worker implements Runnable {
    private double value = 0;
    private long time;
    private double target;
    private TestFinishedListener l;
    private final int id;

    public Worker(TestFinishedListener l, int id) {
        this.l = l;
        this.id = id;

        new Thread(this).start();
    }

    public int getId() {
        return id;
    }

    public synchronized void runBenchmark(double target) {
        this.target = target;
        notify();
    }

    public long getTime() {
        return time;
    }

    @Override
    public void run() {
        synWait();
        value = 0;
        long startTime = System.nanoTime();

        while (value < target) {
            value++;
        }

        long endTime = System.nanoTime();
        time = endTime - startTime;

        l.testFinished(this);
    }

    private synchronized void synWait() {
        try {
            wait();
        } catch (InterruptedException e) {
            e.printStackTrace();
        }
    }
}
4

2 回答 2

6

您需要了解操作系统(或 Java 线程调度程序,或两者兼有)正在尝试在应用程序中的所有线程之间进行平衡,以使它们都有机会执行某些工作,并且在两者之间切换的成本非零线程。使用 1200 个线程,您刚刚达到(并且可能远远超过)处理器在上下文切换上花费的时间比实际工作更多的临界点。

这是一个粗略的类比:

你在 A 房间有一项工作要做。你每天在 A 房间站 8 个小时,然后做你的工作。

然后你的老板过来告诉你,你也必须在 B 房间做一份工作。现在你需要定期离开房间 A,沿着大厅走到 B 房间,然后再走回去。每天步行需要1分钟。现在,您花费 3 小时 59.5 分钟完成每项工作,并在房间之间走动一分钟。

现在假设您有 1200 个房间要工作。您将花费更多时间在房间之间走动,而不是实际工作。这是您将处理器放入的情况。它花费了大量时间在上下文之间切换,以至于没有完成任何实际工作。

编辑:现在,根据下面的评论,也许您在继续之前在每个房间中花费了固定的时间 - 您的工作会有所进展,但房间之间的上下文切换次数仍然会影响单个任务的整体运行时间。

于 2012-07-05T15:04:07.643 回答
1

好的,我想我已经找到了我的问题,但直到现在,还没有解决方案。

在测量每个线程运行以完成其部分工作的时间时,不同的线程总数有不同的可能最小值。最大值每次都相同。如果线程首先启动,然后经常暂停并最后完成。例如,这个最大值可以是 10 秒。假设每个线程完成的操作总量保持不变,那么无论我使用多少线程,当使用不同数量的线程时,必须更改单个线程完成的操作量。例如,使用一个线程,它必须执行 1000 次操作,但使用十个线程,每个人只需执行 100 次操作。现在,使用十个线程,一个线程可以使用的最短时间远低于使用一个线程。所以计算每个线程完成工作所需的平均时间是无稽之谈。使用十个线程的最小值为 1 秒。如果一个线程不间断地工作,就会发生这种情况。

编辑

解决方案是简单地测量第一个线程开始和最后一个线程完成之间的时间量。

于 2012-07-05T15:26:47.880 回答