After the MVP Summit in Seattle, I started to dig into the Parallel FX framework (currently under a CTP available here). In just a few words, the framework is composed of:

  • Task Parallel Library (TPL), that provides means to manage task (i.e. units of execution), and exposes a
  • TPL API, represented by the static function in the Parallel class: For, ForEach and Do
  • Parallel LINQ (PLINQ): allows parallelization of LINQ queries (AsParallel)

Since I have a 2 codes CPU (AMP Athlon 64 X2 Dual) I wanted to see how much performance do I actually gain by parallelizing loops. So I benchmarked matrices multiplication and arrays sorting.

Matrices Multiplication

Having this Matrix class:

class Matrix< T >
{
    private T[,] _values;

    public int Rows { get; private set; }
    public int Columns { get; private set; }

    public T this[int row, int column]
    {
        get { return _values[row, column]; }
        set { _values[row, column] = value; }
    }

    public Matrix(int rows, int columns)
    {
        if (rows < 1) throw new ArgumentOutOfRangeException("rows");
        if (columns < 1) throw new ArgumentOutOfRangeException("columns");
        Rows = rows;
        Columns = columns;
        _values = new T[rows, columns];
    }
}

I could write the multiplication routine like this:

public static Matrix< double > MultiplySequential(Matrix< double> m1, Matrix< double > m2)
{
    if (m1.Columns != m2.Rows)
         throw new ArgumentException("incorrect matrix size", "m2");

    Matrix< double > result = new Matrix< double >(m1.Rows, m2.Columns);
    for (int i = 0; i < m1.Rows; i++)
    {
        for (int j = 0; j < m2.Columns; j++)
        {
            result[i, j] = 0;
            for (int k = 0; k < m1.Columns; k++)
            {
                result[i, j] += m1[i, k] * m2[k, j];
            }
         }
    }
    return result;
}

Modifying the function to use Parallel.For was pretty simple, since this is very straight forward:

public static Matrix< double > MultiplyParallel(Matrix< double> m1, Matrix< double > m2)
{
    if (m1.Columns != m2.Rows)
        throw new ArgumentException("incorrect matrix size", "m2");

    Matrix< double > result = new Matrix< double >(m1.Rows, m2.Columns);
    Parallel.For(
        0,
        m1.Rows,
        i =>
        {
            for (int j = 0; j < m2.Columns; j++)
            {
                result[i, j] = 0;
                for (int k = 0; k < m1.Columns; k++)
                {
                    result[i, j] += m1[i, k] * m2[k, j];
                }
            }
        });

    return result;
}

I used matrices of various sizes, started from 100x100 to 1000x1000 and compared the execution time for the sequential version or the parallel version.

for(int SIZE = 100; SIZE <= 1000; SIZE += 100)
{
    Console.WriteLine("Matrices size: {0}x{0}", SIZE);

    Matrix< double > m1 = MatrixOperations.GenerateRandom(SIZE, SIZE);
    Matrix< double > m2 = MatrixOperations.GenerateRandom(SIZE, SIZE);

    Console.Write("Sequential...\t");
    DateTime starts = DateTime.Now;
    Matrix< double > ms = MatrixOperations.MultiplySequential(m1, m2);
    Console.WriteLine("{0}", DateTime.Now - starts);

    Console.Write("Parallel...\t");
    DateTime startp = DateTime.Now;
    Matrix< double > mp = MatrixOperations.MultiplyParallel(m1, m2);
    Console.WriteLine("{0}", DateTime.Now - startp);
}

The results were:

Matrices size: 100x100
Sequential...   00:00:00.0625000
Parallel...     00:00:00.0937500
Matrices size: 200x200
Sequential...   00:00:00.4531250
Parallel...     00:00:00.2343750
Matrices size: 300x300
Sequential...   00:00:01.4843750
Parallel...     00:00:00.7500000
Matrices size: 400x400
Sequential...   00:00:03.7812500
Parallel...     00:00:02.0625000
Matrices size: 500x500
Sequential...   00:00:07.8750000
Parallel...     00:00:04.1406250
Matrices size: 600x600
Sequential...   00:00:14.3437500
Parallel...     00:00:07.2656250
Matrices size: 700x700
Sequential...   00:00:23.4531250
Parallel...     00:00:11.7812500
Matrices size: 800x800
Sequential...   00:00:36.1250000
Parallel...     00:00:18.4218750
Matrices size: 900x900
Sequential...   00:00:51.5312500
Parallel...     00:00:25.8906250
Matrices size: 1000x1000
Sequential...   00:01:11.5156250
Parallel...     00:00:35.6875000

They show that the time for executing the parallel version was only half the time for the sequential multiplication (remember that I have 2 cores). That means reducing the execution time, not 20 or 30%, but 50%, basically the maximum possible.

Array Sorting

For sorting the arrays I tested with the wost sorting algorithm: bubblesort. Here is the implementation:

interface ISort< T >
{
    void SortSequential(T[] array);
    void SortParallel(T[] array);
}

class BubbleSort< T > : ISort< T >
    where T: IComparable
{
    public void SortSequential(T[] array)
    {
        for(int i = 0; i < array.Length; ++i)
        {
            for(int j = 0; j < array.Length; ++j)
            {
                if(array[i].CompareTo(array[j]) < 0)
                {
                    T aux = array[i];
                    array[i] = array[j];
                    array[j] = aux;
                }
            }
        }
    }

    public void SortParallel(T[] array)
    {
        Parallel.For(
            0,
            array.Length,
            i =>
            {
                for (int j = 0; j < array.Length; ++j)
                {
                    if (array[i].CompareTo(array[j]) < 0)
                    {
                        T aux = array[i];
                        array[i] = array[j];
                        array[j] = aux;
                    }
                }
            });
    }
}

First, I tested that with small array, 100 to 1000 elements.

int STEP = 100;
for (int SIZE = STEP; SIZE <= STEP*20; SIZE += STEP)
{
    double[] array = GenerateRandomDoubles(SIZE);

    ISort< double > sorter = new BubbleSort< double >();

    Console.WriteLine("Array size: {0}", SIZE);

    Console.Write("Sequential...\t");
    DateTime starts = DateTime.Now;
    sorter.SortSequential(array);
    Console.WriteLine("{0}", DateTime.Now - starts);

    Console.Write("Parallel...\t");
    DateTime startp = DateTime.Now;
    sorter.SortParallel(array);
    Console.WriteLine("{0}", DateTime.Now - startp);
}

And the results were:

Array size: 100
Sequential...   00:00:00
Parallel...     00:00:00.0781250
Array size: 200
Sequential...   00:00:00
Parallel...     00:00:00
Array size: 300
Sequential...   00:00:00
Parallel...     00:00:00
Array size: 400
Sequential...   00:00:00
Parallel...     00:00:00
Array size: 500
Sequential...   00:00:00.0156250
Parallel...     00:00:00
Array size: 600
Sequential...   00:00:00
Parallel...     00:00:00.0156250
Array size: 700
Sequential...   00:00:00
Parallel...     00:00:00.0156250
Array size: 800
Sequential...   00:00:00.0156250
Parallel...     00:00:00
Array size: 900
Sequential...   00:00:00.0156250
Parallel...     00:00:00.0156250
Array size: 1000
Sequential...   00:00:00.0312500
Parallel...     00:00:00.0156250
Array size: 1100
Sequential...   00:00:00.0156250
Parallel...     00:00:00.0156250
Array size: 1200
Sequential...   00:00:00.0312500
Parallel...     00:00:00.0312500
Array size: 1300
Sequential...   00:00:00.0312500
Parallel...     00:00:00.0156250
Array size: 1400
Sequential...   00:00:00.0468750
Parallel...     00:00:00.0312500
Array size: 1500
Sequential...   00:00:00.0468750
Parallel...     00:00:00.0312500
Array size: 1600
Sequential...   00:00:00.0468750
Parallel...     00:00:00.0312500
Array size: 1700
Sequential...   00:00:00.0625000
Parallel...     00:00:00.0468750
Array size: 1800
Sequential...   00:00:00.0625000
Parallel...     00:00:00.0468750
Array size: 1900
Sequential...   00:00:00.0781250
Parallel...     00:00:00.0468750
Array size: 2000
Sequential...   00:00:00.0937500
Parallel...     00:00:00.0468750

Then, I changed the for loop, so that it creates array from 5000 to 50000 elements. The results were:

Array size: 5000
Sequential...   00:00:00.5312500
Parallel...     00:00:00.4062500
Array size: 10000
Sequential...   00:00:02.0468750
Parallel...     00:00:01.2500000
Array size: 15000
Sequential...   00:00:04.5781250
Parallel...     00:00:02.7187500
Array size: 20000
Sequential...   00:00:08.1562500
Parallel...     00:00:04.7187500
Array size: 25000
Sequential...   00:00:12.7343750
Parallel...     00:00:07.5625000
Array size: 30000
Sequential...   00:00:18.5468750
Parallel...     00:00:10.6406250
Array size: 35000
Sequential...   00:00:25.4687500
Parallel...     00:00:14.5468750
Array size: 40000
Sequential...   00:00:33.6562500
Parallel...     00:00:18.9531250
Array size: 45000
Sequential...   00:00:42.7968750
Parallel...     00:00:23.8906250
Array size: 50000
Sequential...   00:00:52.9218750
Parallel...     00:00:29.4687500

The conclusion is that the parallel sorting is not efficient when the array size is less than 1000 elements; in other words there is no performance gain, but perhaps performance lost due to overhead of creating tasks and executing them and managing the context switching. The same conclusion is expressed in the framework's documentation:

Target areas of your program where algorithms are computationally expensive and/or data sets are large (e.g. > 1000 elements). In such cases, there will likely be benefit to using parallelism.

That means, that when you parallelize an algorithm, you should take into consideration the size of the data structures it performs on, and unless a given threshold is not reached, a non-parallelized version should be used.

Hits for this post: 11670 .
Trackback

no comment untill now

Add your comment now