Why do more threads take longer? #194

Open

csrdxbb opened this issue Dec 5, 2020 · 0 comments

csrdxbb commented Dec 5, 2020

Today I tested NNPACK on an ARMv8 machine and found that when I increase the number of threads, the runtime increases instead of decreasing.
I am very confused and not sure what the problem is. The test program looks like this:

#include <iostream>
#include <cstdio>
#include <cstdlib>
#include <sys/time.h>
#include <vector>
#include "nnpack.h"

using namespace std;

float test_nnpack(size_t bs, size_t threads)
{
	enum nnp_status init_status = nnp_initialize();
	if (init_status != nnp_status_success)
	{
		return 0;
	}
 
	enum nnp_convolution_algorithm algorithm;
	enum nnp_convolution_transform_strategy strategy=nnp_convolution_transform_strategy_tuple_based;
	const size_t batch_size = bs;  /* batch size passed on the command line */
	const size_t input_channels = 16;
	const size_t output_channels = 16;
	const size_t kernel_num = 3;
	const struct nnp_padding input_padding = {1, 1, 1, 1};
	const struct nnp_size input_size = {224, 224};
	const struct nnp_size kernel_size = {3, 3};
	const struct nnp_size stride = {.width=1, .height=1};
	const struct nnp_size output_size = {
		.width = (input_padding.left + input_size.width + input_padding.right - kernel_size.width)/stride.width + 1,
		.height = (input_padding.top + input_size.height + input_padding.bottom - kernel_size.height)/stride.height + 1
	};
	float *input, *kernel, *output, *bias;
 
	
	input = (float *)malloc(batch_size * input_channels * input_size.height * input_size.width * sizeof(float));
	kernel = (float *)malloc(input_channels * output_channels * kernel_size.height * kernel_size.width * sizeof(float));
	output = (float *)malloc(batch_size * output_channels * output_size.height * output_size.width * sizeof(float));
	bias = (float *)malloc(output_channels * sizeof(float));
	
	/* Create the thread pool with the thread count passed on the command line. */
	pthreadpool_t threadpool = pthreadpool_create(threads);
	printf("Threads: %zu\n", pthreadpool_get_threads_count(threadpool));
 
	struct nnp_profile computation_profile;
	
	int i, j, c, iter;
	struct timeval start, end;
	
	for (c = 0; c < input_channels; c++)
	{
		for (i = 0; i < input_size.height; i++)
		{
			for (j = 0; j < input_size.width; j++)
			{
				input[c * input_size.height * input_size.width + i * input_size.width + j] = (i * input_size.width + j) * 0.1;
			}
		}
	}
	
	
	for(i = 0; i < output_channels; i++)
	{
		for (j = 0; j < input_channels * kernel_size.height * kernel_size.width; j++)
		{
			kernel[i * input_channels * kernel_size.height * kernel_size.width + j] = 0.1;
		}
	}
	
	
	for (i = 0; i < output_channels; i++)
	{
		bias[i] = 1.0;
	}
	
	iter = 1;  /* only one timed call per algorithm */
	gettimeofday(&start, nullptr);
	for (i = 0; i < iter; i++)
	{
		algorithm = nnp_convolution_algorithm_wt8x8;
		nnp_convolution_output(algorithm,
                                  batch_size,
                                  input_channels,
                                  output_channels,
                                  input_size,
                                  input_padding,
                                  kernel_size,
                                  input,
                                  kernel,
                                  bias,
                                  output,
                                  threadpool,
                                  nullptr);
	}
	gettimeofday(&end, nullptr);
	long second = end.tv_sec - start.tv_sec;
	long usecond = end.tv_usec - start.tv_usec;
	float mtime = (second * 1000 + usecond / 1000.0);
	cout << "Winograd convolution elapsed time:" << mtime << "ms"  << endl;
        cout << output[10] << endl;
	
	gettimeofday(&start, nullptr);
	for (i = 0; i < iter; i++)
	{
		algorithm = nnp_convolution_algorithm_ft8x8;
		nnp_convolution_output(algorithm,
                                  batch_size,
                                  input_channels,
                                  output_channels,
                                  input_size,
                                  input_padding,
                                  kernel_size,
                                  input,
                                  kernel,
                                  bias,
                                  output,
                                  threadpool,
                                  nullptr);
	}
	gettimeofday(&end, nullptr);
	second = end.tv_sec - start.tv_sec;
	usecond = end.tv_usec - start.tv_usec;
	mtime = (second * 1000 + usecond / 1000.0);
	cout << "FFT8x8 convolution elapsed time:" << mtime << "ms"  << endl;
        cout << output[10] << endl;        
	
	gettimeofday(&start, nullptr);
	for (i = 0; i < iter; i++)
	{
		algorithm = nnp_convolution_algorithm_implicit_gemm;
		nnp_convolution_output(algorithm,
                                  batch_size,
                                  input_channels,
                                  output_channels,
                                  input_size,
                                  input_padding,
                                  kernel_size,
                                  input,
                                  kernel,
                                  bias,
                                  output,
                                  threadpool,
                                  nullptr);
	}
	gettimeofday(&end, nullptr);
	second = end.tv_sec - start.tv_sec;
	usecond = end.tv_usec - start.tv_usec;
	mtime = (second * 1000 + usecond / 1000.0);
	cout << "GEMM convolution elapsed time:" << mtime << "ms"  << endl;
        cout << output[10] << endl;	

	gettimeofday(&start, nullptr);
	for (i = 0; i < iter; i++)
	{
		algorithm = nnp_convolution_algorithm_direct;
		nnp_convolution_output(algorithm,
                                  batch_size,
                                  input_channels,
                                  output_channels,
                                  input_size,
                                  input_padding,
                                  kernel_size,
                                  input,
                                  kernel,
                                  bias,
                                  output,
                                  threadpool,
                                  nullptr);
	}
	gettimeofday(&end, nullptr);
	second = end.tv_sec - start.tv_sec;
	usecond = end.tv_usec - start.tv_usec;
	mtime = (second * 1000 + usecond / 1000.0);
	cout << "Direct convolution elapsed time:" << mtime << "ms"  << endl;
        cout << output[10] << endl;	

	/* Release the thread pool and buffers before returning. */
	pthreadpool_destroy(threadpool);
	free(input);
	free(kernel);
	free(output);
	free(bias);
	nnp_deinitialize();

	return 0;
}
int main(int argc, char* argv[])
{
	if (argc < 3)
	{
		cout << "Usage: " << argv[0] << " <batch_size> <threads>" << endl;
		return 1;
	}
	size_t batch_size = atoi(argv[1]);
	size_t thread_num = atoi(argv[2]);
	test_nnpack(batch_size, thread_num);
	return 0;
}
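For what it's worth, each algorithm above is only timed over a single call, so the numbers probably also include one-time costs such as waking up the thread pool and cold caches. Below is a minimal sketch of how I could average over several calls after a warm-up run; time_avg_ms is just a hypothetical helper name, and the lambda would wrap the same nnp_convolution_output call as in the listing above.

#include <sys/time.h>
#include <cstdio>
#include <functional>

/* Hypothetical helper: run fn once as a warm-up, then time `iters`
   calls and return the average wall-clock time in milliseconds. */
static double time_avg_ms(const std::function<void()>& fn, int iters)
{
	fn();  /* warm-up: thread-pool wake-up, transform setup, cache warming */
	struct timeval start, end;
	gettimeofday(&start, nullptr);
	for (int i = 0; i < iters; i++)
	{
		fn();
	}
	gettimeofday(&end, nullptr);
	double ms = (end.tv_sec - start.tv_sec) * 1000.0 +
	            (end.tv_usec - start.tv_usec) / 1000.0;
	return ms / iters;
}

/* Usage with the buffers from the listing above, e.g.:
   double ms = time_avg_ms([&]() {
       nnp_convolution_output(nnp_convolution_algorithm_wt8x8, batch_size,
                              input_channels, output_channels, input_size,
                              input_padding, kernel_size, input, kernel,
                              bias, output, threadpool, nullptr);
   }, 20);
   printf("wt8x8 average: %f ms\n", ms); */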

My machine has a NUMA architecture, but I am sure the 32 threads run on the same node, so there is no NUMA remote-access issue.
Please tell me how to improve the timing. Thanks!
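
One more detail: the computation_profile I declare above is never used; it could be passed as the last argument of nnp_convolution_output instead of nullptr to get a per-phase breakdown. This is only a sketch based on the nnp_profile struct in the nnpack.h I have (total, input_transform, kernel_transform, output_transform, block_multiplication), and the fields may stay zero unless NNPACK was built with profiling enabled.

	/* Same call as in the listing, with &computation_profile in place of
	   the trailing nullptr (declaration repeated so the snippet stands alone). */
	struct nnp_profile computation_profile;
	nnp_convolution_output(nnp_convolution_algorithm_wt8x8,
	                       batch_size, input_channels, output_channels,
	                       input_size, input_padding, kernel_size,
	                       input, kernel, bias, output,
	                       threadpool, &computation_profile);
	/* Values are wall-clock times as reported by NNPACK. */
	printf("total:                %f\n", computation_profile.total);
	printf("input transform:      %f\n", computation_profile.input_transform);
	printf("kernel transform:     %f\n", computation_profile.kernel_transform);
	printf("output transform:     %f\n", computation_profile.output_transform);
	printf("block multiplication: %f\n", computation_profile.block_multiplication);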
