文章/答案/技术大牛

发布

社区首页 >问答首页 >C# TPL比C++ PPL快？

问C# TPL比C++ PPL快？
EN

Stack Overflow用户

提问于 2012-01-03 12:23:31

回答 3查看 2.9K关注 0票数 6

我编写了一个非常简单的应用程序，它使用Fibonacci字体来比较TPL的Parallel.ForEach和PPL的parallel_for_each，结果非常奇怪，在有8个核的pc上，c#比c++快11秒。

同样的结果在vs2010和vs 2011年预览。

C#代码：

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Collections.Concurrent;
using System.Threading.Tasks;
using System.Diagnostics;

namespace ConsoleApplication1
{
    class Program
    {

            static void Main(string[] args)
            {
                var ll = new ConcurrentQueue<Tuple<int, int>>();
                var a = new int[12] { 40, 41, 42, 43, 44, 45, 46, 47, 35, 25, 36, 37 };

                long elapsed = time_call(() =>
                {
                    Parallel.ForEach(a, (n) => { ll.Enqueue(new Tuple<int, int>(n, fibonacci(n))); });
                });

                Console.WriteLine("TPL C# elapsed time: " + elapsed + "\n\r");
                foreach (var ss in ll)
                {
                    Console.WriteLine(String.Format("fib<{0}>: {1}", ss.Item1, +ss.Item2));
                }

                 Console.ReadLine();
            }

            static long time_call(Action f)
            {
                var p = Stopwatch.StartNew();
                p.Start();
                f();
                p.Stop();
                return p.ElapsedMilliseconds;
            }

             Computes the nth Fibonacci number.
            static int fibonacci(int n)
            {
                if (n < 2) return n;
                return fibonacci(n - 1) + fibonacci(n - 2);
            }
        }
    }

c++代码：

#include <windows.h>
#include <ppl.h>
#include <concurrent_vector.h>
#include <array>
#include <tuple>
#include <algorithm>
#include <iostream>

using namespace Concurrency;
using namespace std;

template <class Function>
__int64 time_call(Function&& f) {
    __int64 begin = GetTickCount();
    f();
    return GetTickCount() - begin;
}

// Computes the nth Fibonacci number.
int fibonacci(int n) {
    if (n < 2) return n;
    return fibonacci(n-1) + fibonacci(n-2);
}

int wmain() {
    __int64 elapsed;
    array<int, 12> a ={ 40, 41, 42, 43, 44, 45, 46, 47, 35, 25, 36, 37 };
    concurrent_vector<tuple<int,int>> results2;

    elapsed = time_call([&]{
        parallel_for_each(a.begin(), a.end(), [&](int n) {
            results2.push_back(make_tuple(n, fibonacci(n)));
        });
    });  

    wcout << L"PPL  time: " << elapsed << L" ms" << endl << endl;
    for_each (results2.begin(), results2.end(), [](tuple<int,int>& pair) {
        wcout << L"fib(" << get<0>(pair) << L"): " << get<1>(pair) << endl;
    });

    cin.ignore();
}

你能告诉我，我的c++代码哪里出错了吗？

宽度group_task与c#代码具有相同的时间：

task_group tasks;
    elapsed = time_call([&] 
    {
        for_each(begin(a), end(a), [&](int n) 
        {
            tasks.run([&,n]{results2.push_back(make_tuple(n, fibonacci(n)));});
        });
        tasks.wait();

c#-4.0

c++

回答 3

Stack Overflow用户

发布于 2012-01-04 08:12:42

以下是Rahul诉Patil微软团队的解释

你好，

谢谢你提起这个。实际上，您已经确定了与*的默认并行相关的开销--特别是当迭代次数较少且工作大小是可变的时候。默认的并行程序首先将工作分解成8个块(在8个核上)。当工作完成时，工作是动态负载平衡的.在大多数情况下，缺省操作非常有效(大量迭代)，并且当每个迭代的底层工作没有很好地理解时(假设您调用了一个库)--但在某些情况下，它确实带有不可接受的开销。

解决方案正是您在替代执行过程中所确定的。为此，我们将在下一个版本的Visual中提供一个名为"simple“的分区器，它将类似于您描述的替代实现，并且将具有更好的性能。

PS:每个实现的C#和C++并行在迭代过程中使用的算法略有不同，因此，根据工作负载的不同，您将看到稍微不同的性能特征。

看待

票数 8

Stack Overflow用户

发布于 2015-07-17 22:59:59

您的代码存在一些问题，让我们逐一解决它们：

使用递归计算Fibonacci会导致进程使用过多的内存量，因为它使用调用堆栈来计算结果。具有递归Fibonacci意味着您不是比较C# / C++并行框架，而是比较调用堆栈机制。您可以更快地计算斐波纳契：

int fibonacci(int n) 
{
    int curr = 1, prev = 0, total = 0;
    for (int i = 0; i < n; i++)
    {
        int pc = curr;
        curr += prev;
        total += curr;
        prev = pc;
    }
    return total;
}

使用这个函数，我必须运行至少一百万次才能得到可测量的值。

使用GetTickCount64而不是GetTickCount：

template <class Function>
__int64 time_call(Function&& f) 
{
    __int64 begin = ::GetTickCount64();
    f();
    return GetTickCount64() - begin;
}

使用如此小的机身运行parallel_for实际上会损害性能。最好是使用一个更粒状的函子。

考虑到这些特性，下面是C++中的代码：

// ParallelFibo.cpp : Defines the entry point for the console application.
//

#include "stdafx.h"
#include <windows.h>
#include <ppl.h>
#include <concurrent_vector.h>
#include <array>
#include <tuple>
#include <algorithm>
#include <iostream>
#include <random>

using namespace Concurrency;
using namespace std;

template <class Function>
__int64 time_call(Function&& f) 
{
    __int64 begin = ::GetTickCount64();
    f();
    return GetTickCount64() - begin;
}

// Computes the nth Fibonacci number.
inline int fibonacci(int n) 
{
    int curr = 1, prev = 0, total = 0;
    for (int i = 0; i < n; i++)
    {
        int pc = curr;
        curr += prev;
        total += curr;
        prev = pc;
    }
    return total;
}

#define NUMBER_OF_REPETITIONS 1000000
#define MIN_FIBO              37
#define MAX_FIBO              49

int main()
{
    __int64 elapsed;
    vector<int> v;
    for (int i = MIN_FIBO; i < MAX_FIBO; i++)
    {
        v.emplace_back(i);
    }

    concurrent_vector<tuple<int, int>> results;
    elapsed = time_call([&] {
        parallel_for(MIN_FIBO, MAX_FIBO, [&](int n) {
            for (int i = 0; i < NUMBER_OF_REPETITIONS; i++)
            {
                results.push_back(make_tuple(n, fibonacci(n)));
            }
        });
    });
    wcout << L"PPL  time: " << elapsed << L" ms" << endl << endl;
    cin.ignore();
    return 0;
}

下面是C#中的代码：

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using System.Collections.Concurrent;
using System.Diagnostics;

namespace ParallelFiboCS
{
    class Program
    {
        const int NUMBER_OF_REPETITIONS = 1000000;
        const int MIN_FIBO = 37;
        const int MAX_FIBO = 49;
        static void Main(string[] args)
        {
            var ll = new ConcurrentQueue<Tuple<int, int>>();

            var a = new int[MAX_FIBO - MIN_FIBO];
            for (int i = MIN_FIBO; i < MAX_FIBO; i++)
            {
                a[i - MIN_FIBO] = i;
            }

            long elapsed = time_call(() =>
            {
                Parallel.ForEach(a, (n) =>
                {
                    for (int i = 0; i < NUMBER_OF_REPETITIONS; i++)
                    {
                        ll.Enqueue(new Tuple<int, int>(n, fibonacci(n)));
                    }
                });
            });

            Console.WriteLine("TPL C# elapsed time: " + elapsed + "\n\r");

            Console.ReadLine();
        }

        static long time_call(Action f)
        {
            var p = Stopwatch.StartNew();
            p.Start();
            f();
            p.Stop();
            return p.ElapsedMilliseconds;
        }

        static int fibonacci(int n)
        {
            int curr = 1, prev = 0, total = 0;
            for (int i = 0; i < n; i++)
            {
                int pc = curr;
                curr += prev;
                total += curr;
                prev = pc;
            }
            return total;
        }
    }
}

对37至49之间的数字进行1 200万斐波纳契计算的平均时间：

C++：513 C++

C#：2527 C#

票数 5

Stack Overflow用户

发布于 2012-01-03 14:14:23

用于测量本地传递时间的GetTickCount (http://msdn.microsoft.com/en-us/library/windows/desktop/ms724408%28v=vs.85%29.aspx)函数根本不精确。描述是这样说的：

GetTickCount函数的分辨率仅限于系统定时器的分辨率，通常在10毫秒到16毫秒之间。

根据我使用GetSystemTime的经验，windows和up都会产生更好的效果(如果我没记错的话，win XP的分辨率与滴答计数差不多)。或者更好的是，您可以使用一些提供子mils分辨率的othet来进行细粒度测量。在您的例子中，构建大型数据集可能会更有帮助，因为有一些有意义的数据。

票数 0

页面原文内容由Stack Overflow提供。腾讯云小微IT领域专用引擎提供翻译支持

原文链接：

https://stackoverflow.com/questions/8712242

复制

相似问题

问C# TPL比C++ PPL快？
EN

回答 3

Stack Overflow用户

Stack Overflow用户

Stack Overflow用户

社区

活动

圈层

关于

腾讯云开发者

热门产品

热门推荐

更多推荐

问C# TPL比C++ PPL快？EN

回答 3

Stack Overflow用户

Stack Overflow用户

Stack Overflow用户

社区

活动

圈层

关于

腾讯云开发者

热门产品

热门推荐

更多推荐

问C# TPL比C++ PPL快？
EN