c++ - 如何加速这个 C++ 代码（尤其是阶乘和 pow 部分）

Question

我在 Hackerrank 上做这个叫做算术级数的问题。

https://www.hackerrank.com/challenges/arithmetic-progressions

我的解决方案通过了前六项测试。我已经尝试了所有可能的方式来优化我的代码，包括缓存，更高效的操作。我还是没能通过考试。我认为我失败的两个地方是阶乘函数和 pow 函数。

基本上，我使用 unordered_map 来存储之前的所有结果。如果参数是关键之一，我将立即返回结果。

这是我的代码：

#include <cstdio>
#include <vector>
#include <iostream>
#include <algorithm>
#include <unordered_map>
#include <utility>
#include <ctime>
#include <sys/time.h>
using namespace std;


#define mod(m) (m > 1000003) ? m % 1000003 : m

//used for hashing the pair
namespace std {

template<typename a, typename b>
struct hash< pair<a, b> > {
private:
   const hash<a> ah;
   const hash<b> bh;
public:
   hash() : ah(), bh() {};
   size_t operator()(const std::pair<a, b> &p) const {
      return ah(p.first) ^ bh(p.second);
   }
};
} // namespaces

//the code below is used for collecting statistics
/*
typedef unsigned long long timestamp_t;

static timestamp_t get_timestamp ()
{
    struct timeval now;
    gettimeofday (&now, NULL);
    return  now.tv_usec + (timestamp_t)now.tv_sec * 1000000;
}
*/

//note that the number could get really large, that's why I use unsigned long long for most of data type
static unsigned long long cache_D[100000];
static unsigned long long cache_d[100000];
static unsigned long long cache_p[100000];
static unordered_map<unsigned long long, unsigned long long> cache_F;           //use       unordered_map to store the factorial, avg insert, lookup O(1)
static unordered_map<pair<unsigned long long, unsigned long long>, unsigned long long>   cache_P; //use unordered_map to store the pow
//static double pow_sec = 0;
//static double fac_sec = 0;


/**
 * Use the fast pow algorithm. On top of that, I add caching (stored in unordered map) to     speed up the pow
 * @param  x base
 * @param  y exponent
 * @return   x ^ y
 */
unsigned long long pow(unsigned long long x, unsigned long long y)
{
    //timestamp_t t0 = get_timestamp();
    pair<unsigned long long, unsigned long long> curr(x, y);
    if(cache_P.find(curr) != cache_P.end())
    {
        //timestamp_t t1 = get_timestamp();
        //pow_sec += (t1 - t0) / 1000000.0L;
        return cache_P[curr];
    }
    unsigned long long result = 1;
    unsigned long long mod_x =  mod(x);
    //unsigned long long count = 0;

    while( y )
    {
        if ( y & 1 )
        {
            unsigned long long temp = result *  mod_x;
            result = mod(temp);
        }
        y >>= 1;
        unsigned long long temp = mod_x *  mod_x;
        mod_x =  mod(temp);
    }
    cache_P[curr] = result;
    //timestamp_t t1 = get_timestamp();
    //pow_sec += (t1 - t0) / 1000000.0L;
    return result;
}

/**
 * same idea as pow, caching whenever I can
 * @param  x number to be factorialized
 * @return   x!
 */
unsigned long long factorial(unsigned long long x)
{
    //timestamp_t t0 = get_timestamp();
    if (cache_F.find(x) != cache_F.end())
    {
        //timestamp_t t1 = get_timestamp();
        //fac_sec += (t1 - t0) / 1000000.0L;
        return cache_F[x];
    }
    else
    {
        unsigned long long result = 1;
        //here we go from x to 1 since we could speed up operation as soon as we have x - 1 or x - 2 or x - 3 in our caching (just x * (x - 1)! )
        for(unsigned long long i = x; i >= 1; i--)
        {
            if(cache_F.find(i) != cache_F.end())
            {
                unsigned long long temp1 = result * cache_F[i];
                result = mod(temp1);
                cache_F[x] = result;
                //timestamp_t t1 = get_timestamp();
                //fac_sec += (t1 - t0) / 1000000.0L;
                return result;
            }
            unsigned long long mod_i = mod(i);
            unsigned long long temp2 = result * mod_i;
            result = mod(temp2);
        }
        cache_F[x] = result;
        //timestamp_t t1 = get_timestamp();
        //fac_sec += (t1 - t0) / 1000000.0L;
        return result;
    } 
}
void query(int from, int to)
{
    unsigned long long k = 0;
    unsigned long long constant = 1;

    for(int i = from - 1; i < to; i++)
    {
        k += cache_p[i];
        unsigned long long temp = constant * cache_D[i];
        constant = mod(temp);
    }
    unsigned long long temp = constant * factorial(k);
    constant = mod(temp);
    printf("%llu %llu\n", k, constant);
}


void update(int from, int to, int how_much)
{
    for(int i = from - 1; i < to; i++)
    {
         cache_p[i] += how_much;
         unsigned long long temp = cache_D[i] * pow(cache_d[i], (unsigned long long)how_much);
         cache_D[i] = mod(temp);
    }
}

int main() {
    int num_vec, num_operations;
    FILE *pFile = fopen("input.txt", "r");

    fscanf(pFile, "%d", &num_vec);
    for(int i = 0; i < num_vec; i++)
    {
        unsigned long long a, d, q;
        fscanf(pFile, "%llu %llu %llu", &a, &d, &q);
        cache_d[i] = d;
        cache_p[i] = q;
        cache_D[i] = pow(d, q);
    }
    fscanf(pFile, "%d", &num_operations);
    for(int i = 0; i < num_operations; i++)
    {
        int what_operation, from, to;
        fscanf(pFile, "%d %d %d", &what_operation, &from, &to);
        if(what_operation == 0)
        {
             query(from, to);
        }
        else if (what_operation == 1)
        {
            int add_q;
            fscanf(pFile, "%d", &add_q);
            update(from, to, add_q);
        }
    }
    printf("sec for pow: %f\n sec for fac: %f", pow_sec, fac_sec);
    return 0;
}

如果有人知道如何进一步优化我的代码，那将非常有帮助。谢谢！

score 5 · Accepted Answer

关于阶乘。

“unsigned long long”的范围从 0 到

18,446,744,073,709,551,615

21的阶乘是：

51,090,942,171,709,440,000

因此，这意味着您的函数只能返回 0 到 20 阶乘的正确结果。

从具有 21 个元素（0 到 20）的预构建缓存开始会容易得多。

unsigned long long fact_cache [21] = {

 1                      ,
 1                      ,
 2                      ,
 6                      ,
 24                     ,
 120                    ,
 720                    ,
 5040                   ,
 40320                  ,
 362880                 ,
 3628800                ,
 39916800               ,
 479001600              ,  
 6227020800             ,
 87178291200            ,
 1307674368000          ,
 20922789888000         ,
 355687428096000        ,
 6402373705728000       ,
 121645100408832000     ,
 2432902008176640000                      

};

现在您的阶乘函数可以在数组中查找正确的值（如果您愿意，可以检查边界）。

编辑：（实现了 OP 预期的“阶乘mod 1000003 ”）

我从阅读以下答案开始：

计算n的快速方法！mod m 其中 m 是素数？

基于第二个答案，带有python中的代码示例，以及更多信息：

http://www.algorithmist.com/index.php/Modular_inverse

他给出了这个 Python 代码，以及一个可能的改进。

def factorialMod(n, modulus):
    ans=1
    if n <= modulus//2:
        #calculate the factorial normally (right argument of range() is exclusive)
        for i in range(1,n+1):
            ans = (ans * i) % modulus   
    else:
        #Fancypants method for large n
        for i in range(n+1,modulus):
            ans = (ans * i) % modulus
        ans = modinv(ans, modulus)
        ans = -1*ans + modulus
    return ans % modulus

与原始方法相比，当 n 大于模除以 2 时，这具有一些优势，因为我们可以通过求解模逆来减少乘法的次数。

因为我们知道模数 (1000003)，我们可以通过使用它的 totient (1000002) 来求解 modinv。

在伪代码中，我们得到如下内容：

long factorial_mod( long n ) {
    if( n > 1000003 )
         return 0;  //from Thomas's comment

    if( cache[n] != 0 ) //if the cache has the answer
        return cache[n];

    long result = 1;
    if ( n <= 500001 )
    {
        for( long i = 1; i <= n; i++ )
        {
            result = (result * i) % 1000003;
            cache[i] = result;
        }
    }
    else
    {
        for( long i = n+1; i <= 1000003; i++)
        {
            result = (result * i) % 1000003;
        }
        result = modinv(result, 1000003);
        result = -1*result + 1000003; 
    }

    result = result % 1000003; 
    cache[n] = result;
    return result;
}

long modinv( long a, int modulus ) {
    return modPow( a, 1000002, modulus); // ( (a to the totient) mod the modulus )
}

如果我们不想计算总分，我们可以使用扩展欧拉 GCD 来求解模逆。（当然，素数的总和很容易计算……只需减去一个）。

c++ - 如何加速这个 C++ 代码（尤其是阶乘和 pow 部分）

1 回答 1

Related

Reference