/*++
 *
 * Copyright (c) 2004-2006 Intel Corporation - All Rights Reserved
 *
 * This software program is licensed subject to the BSD License, 
 * available at http://www.opensource.org/licenses/bsd-license.html
 *
 * Abstract: The main routine
 * 
 --*/

#include <stdio.h>
#include "crc.h"
#include "stdlib.h"
#include "string.h"

#pragma warning( disable: 4267 )
//to avoid unnecessary warnings when using "touch" variables

/* 
 * Lookup tables used by the CRC32c routines below; they are only declared
 * here and are defined outside the code visible in this file.
 * crc_tableil8_o32 is the table indexed by the byte-at-a-time loops; the
 * slicing-by-8 loop indexes all eight tables once per 8 input bytes.
 */

extern uint32_t crc_tableil8_o32[256];
extern uint32_t crc_tableil8_o40[256];
extern uint32_t crc_tableil8_o48[256];
extern uint32_t crc_tableil8_o56[256];
extern uint32_t crc_tableil8_o64[256];
extern uint32_t crc_tableil8_o72[256];
extern uint32_t crc_tableil8_o80[256];
extern uint32_t crc_tableil8_o88[256];


/**
 *
 * Routine Description:
 *
 * Computes the CRC32c checksum of a buffer one byte at a time, doing a single
 * lookup in crc_tableil8_o32 per input byte.
 *
 * Arguments:
 *
 *		p_running_crc - pointer to the remainder carried over from an earlier
 *						call. It is only dereferenced when mode is MODE_CONT
 *						(so it must be non-NULL in that case) and it is
 *						never written.
 *		p_buf - the packet buffer the CRC is computed over
 *		length - the length of p_buf in bytes
 *		mode - MODE_CONT seeds the computation from *p_running_crc; every
 *			   other mode seeds it with CRC32C_INIT_REFLECTED. MODE_BEGIN and
 *			   MODE_CONT return the raw remainder; every other mode returns
 *			   the remainder XORed with XOROT.
 *
 * Return value:
 *		
 *		The computed CRC32c value (raw or finalized, depending on mode)
 */

uint32_t
crc32c(
	uint32_t*		p_running_crc,
    const uint8_t*	p_buf,
    const uint32_t	length,
	uint8_t			mode) 
{
	uint32_t remainder = (mode == MODE_CONT) ? *p_running_crc
											 : CRC32C_INIT_REFLECTED;
	uint32_t bytes_left;

	for(bytes_left = length; bytes_left != 0; bytes_left--)
	{
		/* the low byte of (remainder ^ input byte) selects the table entry */
		const uint32_t slot = (remainder ^ *p_buf++) & 0x000000FF;
		remainder = crc_tableil8_o32[slot] ^ (remainder >> 8);
	}

	/* only a finished CRC gets the final XOR; partial results stay raw */
	if((mode != MODE_BEGIN) && (mode != MODE_CONT))
		remainder ^= XOROT;
	return remainder;
}



/**
 *
 * Routine Description:
 *
 * Computes the CRC32c checksum for the specified buffer using the slicing by 8 
 * algorithm: after an optional byte-wise prefix, the buffer is consumed 8 bytes
 * per step (as two 32-bit words, eight table lookups), and any remaining tail
 * bytes are again processed one at a time.
 *
 * Arguments:
 *
 *		p_running_crc - pointer to the remainder carried over from an earlier
 *						call. It is only dereferenced when mode is MODE_CONT
 *						(so it must be non-NULL in that case) and it is
 *						never written.
 *		p_buf - the packet buffer where crc computations are being performed
 *		length - the length of p_buf in bytes
 *		init_bytes - the number of leading bytes processed one at a time
 *					 before the 8-byte steps start. Values larger than length
 *					 are clamped to length so the buffer is never over-read.
 *		mode - MODE_CONT seeds the computation from *p_running_crc; every
 *			   other mode seeds it with CRC32C_INIT_REFLECTED. MODE_BEGIN and
 *			   MODE_CONT return the raw remainder; every other mode returns
 *			   the remainder XORed with XOROT.
 *
 * Return value:
 *		
 *		The computed CRC32c value (raw or finalized, depending on mode)
 */

uint32_t
crc32c_sb8_64_bit(
	uint32_t* p_running_crc,
    const uint8_t*	p_buf,
    const uint32_t length,
	const uint32_t init_bytes,
	uint8_t			mode)
{
	uint32_t li;
	uint32_t crc, term1, term2;
	uint32_t word;
	uint32_t head_bytes;
	uint32_t running_length;
	uint32_t end_bytes;

	if(mode ==  MODE_CONT)
		crc = *p_running_crc;
	else	
		crc = CRC32C_INIT_REFLECTED;

	/*
	 * Clamp the prefix: with init_bytes > length the unsigned subtraction
	 * below would wrap and the loops would run far past the buffer.
	 */
	head_bytes = (init_bytes < length) ? init_bytes : length;
	running_length = ((length - head_bytes)/8)*8;
	end_bytes = length - head_bytes - running_length; 

	for(li=0; li < head_bytes; li++) 
		crc = crc_tableil8_o32[(crc ^ *p_buf++) & 0x000000FF] ^ (crc >> 8);	

	for(li=0; li < running_length/8; li++) 
	{
		/*
		 * memcpy performs the 32-bit loads without casting away const and
		 * without requiring p_buf to be 4-byte aligned.
		 * NOTE(review): the words are read in host byte order, exactly as
		 * the original pointer-cast loads were; the table indexing looks
		 * written for little-endian hosts - confirm before using elsewhere.
		 */
		memcpy(&word, p_buf, sizeof word);
		crc ^= word;
		p_buf += 4;
		term1 = crc_tableil8_o88[crc & 0x000000FF] ^
				crc_tableil8_o80[(crc >> 8) & 0x000000FF];
		term2 = crc >> 16;
		crc = term1 ^
			  crc_tableil8_o72[term2 & 0x000000FF] ^ 
			  crc_tableil8_o64[(term2 >> 8) & 0x000000FF];

		/* second word of the 8-byte step feeds the four lower tables */
		memcpy(&word, p_buf, sizeof word);
		term1 = crc_tableil8_o56[word & 0x000000FF] ^
				crc_tableil8_o48[(word >> 8) & 0x000000FF];
		term2 = word >> 16;
		crc =	crc ^ 
				term1 ^		
				crc_tableil8_o40[term2  & 0x000000FF] ^	
				crc_tableil8_o32[(term2 >> 8) & 0x000000FF];	
		p_buf += 4;
	}

	for(li=0; li < end_bytes; li++) 
		crc = crc_tableil8_o32[(crc ^ *p_buf++) & 0x000000FF] ^ (crc >> 8);

	if((mode == MODE_BEGIN) || (mode ==  MODE_CONT))
		return crc;		
    return crc ^ XOROT;	
}


/**
 *
 * Routine Description:
 *
 * Warms the lookup tables: reads every one of the 256 entries of each of the
 * eight crc_tableil8_* tables into a volatile sink so the reads cannot be
 * optimized away.
 *
 * Arguments:
 *
 *		none
 *
 * Return value:
 *		
 *		none
 */

void
warm_tables( void )
{
	volatile uint32_t i, touch;
	
	/* the value read is irrelevant; only the memory access matters */
	for(i=0; i < 256; i++)
	{
		touch = crc_tableil8_o32[i];
		touch = crc_tableil8_o40[i];
		touch = crc_tableil8_o48[i];
		touch = crc_tableil8_o56[i];
		touch = crc_tableil8_o64[i];
		touch = crc_tableil8_o72[i];
		touch = crc_tableil8_o80[i];
		touch = crc_tableil8_o88[i];
	}

}

/**
 *
 * Routine Description:
 *
 * Reads through a static scratch array twice the size of
 * CPU_DATA_CACHE_SIZE, one byte at a time, so that previously cached data
 * is likely to be displaced.
 *
 * Arguments:
 *
 *		none
 *
 * Return value:
 *		
 *		none
 */

void
purge_data_cache( 
				void )
{
	// the scratch area is twice the cache size to raise the odds that
	// every physical page held in the cache gets displaced.
	static volatile uint8_t dummy_cache[2 * CPU_DATA_CACHE_SIZE];
	// volatile sink: keeps the reads below from being optimized away
	static volatile uint8_t temp;
	const size_t span = 2 * CPU_DATA_CACHE_SIZE;
	size_t pos = 0;

	while( pos < span )
	{
		temp = dummy_cache[pos];
		pos++;
	}
}

/**
 *
 * Routine Description:
 *
 * Invalidates a buffer by applying CPU_CACHE_FLUSH to the start of every
 * complete 64-byte stride of it.
 *
 * Arguments:
 *
 *		p_buf - start of the buffer
 *		size - size of the buffer in bytes
 *
 * Return value:
 *		
 *		none
 */

void
invalidate_buffer(
	uint8_t* p_buf, 
	uint32_t size)
{
	/*
	 * NOTE(review): only size/64 whole strides are visited, so a trailing
	 * size % 64 bytes are never passed to CPU_CACHE_FLUSH - confirm that
	 * this is acceptable for buffers whose size is not a multiple of 64.
	 */
	const uint32_t num_lines = size/64;
	uint32_t line;
	volatile void const* cache_line;

	for(line = 0; line < num_lines; line++)
	{
		cache_line = (void const*)(p_buf + line*64);
		CPU_CACHE_FLUSH(cache_line)
	}
}



/**
 *
 * Routine Description:
 *
 * Warms the packet data: reads every byte of the source buffer (p_sbuf) of
 * each test object into a volatile sink.
 *
 * Arguments:
 *
 *		p_test - array of test objects
 *		num_of_packets - number of entries of p_test to walk
 *
 * Return value:
 *		
 *		none
 */
static void
warm_data(
	const crc_test_t* const p_test,
	uint32_t const num_of_packets )
{
	size_t pkt;
	for( pkt = 0; pkt < num_of_packets; pkt++ )
	{
		const crc_test_t* const p_cur = &p_test[pkt];
		size_t off = 0;
		while( off < p_cur->buf_length )
		{
			/* volatile so the read itself is kept */
			volatile uint8_t touch = p_cur->p_sbuf[off];
			off++;
		}
	}
}

/**
	Runs invalidate_buffer() over the source buffer (p_sbuf, buf_length
	bytes) of each of the first num_of_packets test objects.
*/
static void
chill_data(
	const crc_test_t* const p_test,
	uint32_t const num_of_packets )
{
	const crc_test_t* p_cur = p_test;
	const crc_test_t* const p_stop = p_test + num_of_packets;

	while( p_cur != p_stop )
	{
		invalidate_buffer( p_cur->p_sbuf, p_cur->buf_length );
		p_cur++;
	}
}

/**
	Warms one crc_test object: reads each of its sizeof(*p_test) bytes
	into a volatile sink.
*/
static void
crc_test_warm(
	const crc_test_t* const p_test )
{
	const uint8_t* const p_raw = (const uint8_t*)p_test;
	size_t off;
	for( off = 0; off != sizeof(*p_test); off++ )
	{
		volatile uint8_t touch = p_raw[off];
	}
}



/**
	Intended to put the caches into the state requested in p_info before a
	timed run. The whole body is currently commented out, so this function
	does nothing and both arguments are unused.

	The disabled code would purge the data cache, warm the tables when
	crc_table_status is WARM, warm or chill the packet data depending on
	crc_data_status, and finally warm the test object itself.
*/
static void
set_cache_state(
	crc_test_t* p_test,
	crc_eval_info_t* p_info)
{
/*	purge_data_cache();

	if( p_info->crc_table_status == WARM )
		warm_tables();

	if( p_info->crc_data_status == WARM )
		warm_data( p_test, p_info->crc_num_of_iterations );	
	else
		chill_data( p_test, p_info->crc_num_of_iterations );	

	// warm the test object itself
	crc_test_warm( p_test );*/
}

/**
 *
 * Routine Description:
 *
 * Performs the mpa sample frame test: builds a zero-filled frame of
 * MPA_FRAME_LENGTH bytes with four preset bytes, then checks that both
 * crc32c() and crc32c_sb8_64_bit() produce MPA_FRAME_CRC for it.
 * On a mismatch the program prints "error" and terminates with a failure
 * exit status.
 *
 * Arguments:
 *
 *		none
 *
 * Return value:
 *		
 *		none
 */

void
mpa_sample_frame_test(
					void)
{
	uint8_t	mpa[MPA_FRAME_LENGTH];
	uint32_t result;

	memset(mpa, 0, sizeof mpa);
	mpa[MPA_FRAME_INDEX1] = MPA_FRAME_VALUE1;
	mpa[MPA_FRAME_INDEX2] = MPA_FRAME_VALUE2;
	mpa[MPA_FRAME_INDEX3] = MPA_FRAME_VALUE3;
	mpa[MPA_FRAME_INDEX4] = MPA_FRAME_VALUE4;
	printf("\nVerifying algorithms against MPA sample frame\n\n"); 
	
	printf("testing the Sarwate algorithm..............................");
	result = crc32c(NULL, mpa, MPA_FRAME_LENGTH, MODE_BODY);
	if( result != MPA_FRAME_CRC)
	{
		printf( "error\n" );
		/* a failed self-test must not look like success to the caller */
		exit(EXIT_FAILURE);
	}
	printf( "passed\n" );

	printf("testing the slicing by 8 over 64 bit algorithm.............");
	result = crc32c_sb8_64_bit(NULL, mpa, MPA_FRAME_LENGTH, 0, MODE_BODY);
	if( result != MPA_FRAME_CRC)
	{
		printf( "error\n" );
		exit(EXIT_FAILURE);
	}
	printf( "passed\n" );
}

#define N_SUMS 64
#define PAGE_SIZE 8192
#define HASH_PRIME 31

/*
 * Hashes one PAGE_SIZE-byte page viewed as rows of N_SUMS 16-bit values.
 *
 * Each of the N_SUMS columns keeps its own 16-bit running hash
 * (hash = hash * HASH_PRIME + value, wrapping at 16 bits). The column
 * hashes are then folded, in column order, into one 32-bit value with the
 * same multiply-and-add step, and the low 16 bits of that are returned.
 *
 * page must be readable for PAGE_SIZE bytes and suitably aligned for
 * uint16_t access.
 */
uint16_t
checksum_simd_i16(void * __restrict__ page) {
	uint16_t lanes[N_SUMS]  __attribute__ ((aligned (__BIGGEST_ALIGNMENT__))) = { 0 };
	uint16_t (*rows)[N_SUMS] __attribute__ ((aligned (__BIGGEST_ALIGNMENT__))) =
		(uint16_t (*)[N_SUMS]) page;
	const unsigned num_rows = PAGE_SIZE / sizeof(uint16_t) / N_SUMS;
	uint32_t folded = 0;
	unsigned r, c;

	/* independent per-column hashes: no dependency between columns */
	for (r = 0; r < num_rows; r++) {
		const uint16_t *row = rows[r];

		for (c = 0; c < N_SUMS; c++)
			lanes[c] = lanes[c] * HASH_PRIME + row[c];
	}

	/* combine the column hashes into a single value */
	for (c = 0; c < N_SUMS; c++)
		folded = folded * HASH_PRIME + lanes[c];

	return (uint16_t) folded;
}

/*
 * Multiplier table loaded into xmm10 by checksum_simd_asm(). The values are
 * the powers 31^0 .. 31^7 reduced modulo 2^16. Explicitly 16-byte aligned
 * because it is read with movdqa.
 */
uint32_t multiplyVector[8] __attribute__ ((aligned (16))) =
{ 0x1, 0x1f, 0x3c1, 0x745f, 0x1781, 0xd89f, 0x3b41, 0x2cdf };

/*
 * Hand-written SSE2 (x86-64 only) checksum over one 8192-byte page.
 *
 * Eight vector accumulators (xmm1..xmm8) are loaded from the first 128
 * bytes of the page; the loop then walks the remaining 63 blocks of 128
 * bytes, multiplying every accumulator by xmm9 and adding the matching
 * 16 bytes of the block. Afterwards the accumulators are weighted,
 * summed and reduced horizontally; the low 16 bits of the result are
 * folded into two bytes as ((r / 255) % 255) << 8 | (r % 255).
 *
 * page must be 16-byte aligned (movdqa/paddw memory operands) and
 * readable for 8192 bytes.
 *
 * NOTE(review): xmm9 is built with movd + pshufd (a 32-bit broadcast of
 * 0x1F) and multiplyVector holds 32-bit elements, while pmullw multiplies
 * 16-bit lanes - so every odd 16-bit lane is multiplied by zero. That
 * behavior is kept exactly as found; confirm whether a 16-bit broadcast
 * and a 16-bit table were intended.
 */
uint16_t
checksum_simd_asm (void *page) {
	uint32_t result;
	const uint8_t *page_end = (const uint8_t *) page + 8192;
	/* negative byte offset from page_end; counts up to zero in the loop */
	int64_t hash_start = -8192;

	__asm__(
"mov	$0x80, %%rdi\n" // bytes consumed per loop iteration (8 x 16)

"movdqa (%[end],%[idx],1), %%xmm1\n" // initialize the accumulators with the first 8*16 bytes
"xor	%%ecx, %%ecx\n"
"pinsrw $0x2, %%ecx, %%xmm1\n" // clear 16-bit word 2 of the first vector (presumably the stored checksum - confirm)
"movdqa 0x10(%[end],%[idx],1), %%xmm2\n"
"movdqa 0x20(%[end],%[idx],1), %%xmm3\n"
"movdqa 0x30(%[end],%[idx],1), %%xmm4\n"
"movdqa 0x40(%[end],%[idx],1), %%xmm5\n"
"movdqa 0x50(%[end],%[idx],1), %%xmm6\n"
"movdqa 0x60(%[end],%[idx],1), %%xmm7\n"
"movdqa 0x70(%[end],%[idx],1), %%xmm8\n"
"add    %%rdi, %[idx]\n"

"mov	$0x1F, %%ecx\n" // Init xmm9 to multiplier
"movd	%%ecx, %%xmm9\n"
"pshufd $0x0, %%xmm9, %%xmm9\n"

// main loop, accumulate hash codes in parallel
// (numeric local label: safe even if this asm is emitted more than once)

"1:\n"
"pmullw  %%xmm9, %%xmm1\n"
"paddw   (%[end],%[idx],1), %%xmm1\n"
"pmullw  %%xmm9, %%xmm2\n"
"paddw   0x10(%[end],%[idx],1), %%xmm2\n"
"pmullw  %%xmm9, %%xmm3\n"
"paddw   0x20(%[end],%[idx],1), %%xmm3\n"
"pmullw  %%xmm9, %%xmm4\n"
"paddw   0x30(%[end],%[idx],1), %%xmm4\n"
"pmullw  %%xmm9, %%xmm5\n"
"paddw   0x40(%[end],%[idx],1), %%xmm5\n"
"pmullw  %%xmm9, %%xmm6\n"
"paddw   0x50(%[end],%[idx],1), %%xmm6\n"
"pmullw  %%xmm9, %%xmm7\n"
"paddw   0x60(%[end],%[idx],1), %%xmm7\n"
"pmullw  %%xmm9, %%xmm8\n"
"paddw   0x70(%[end],%[idx],1), %%xmm8\n" // was 0x80: skipped 16 bytes per block and read past the page end

"add     %%rdi, %[idx]\n" // sets ZF once the offset reaches zero (end of page)
"jnz 1b\n"

"movdqa (%[mul]), %%xmm10\n" // init xmm10 from the first 16 bytes of multiplyVector
"movdqa %%xmm9, %%xmm11\n" // init xmm11 to xmm9 raised to the 4th power
"pmullw %%xmm11, %%xmm11\n"
"pmullw %%xmm11, %%xmm11\n"

"pmullw %%xmm10, %%xmm8\n" // mul each position with hash multiplier powers and
"pmullw %%xmm11, %%xmm10\n" // and do a tree structured accumulation
"pmullw %%xmm10, %%xmm7\n"
"paddw %%xmm8, %%xmm7\n"
"pmullw %%xmm11, %%xmm10\n"
"pmullw %%xmm10, %%xmm6\n"
"pmullw %%xmm11, %%xmm10\n"
"pmullw %%xmm10, %%xmm5\n"
"paddw %%xmm6, %%xmm5\n"
"pmullw %%xmm11, %%xmm10\n"
"pmullw %%xmm10, %%xmm4\n"
"pmullw %%xmm11, %%xmm10\n"
"pmullw %%xmm10, %%xmm3\n"
"paddw %%xmm4, %%xmm3\n"
"pmullw %%xmm11, %%xmm10\n"
"pmullw %%xmm10, %%xmm2\n"
"pmullw %%xmm11, %%xmm10\n"
"pmullw %%xmm10, %%xmm1\n"
"paddw %%xmm2, %%xmm1\n"
"paddw %%xmm7, %%xmm5\n"
"paddw %%xmm3, %%xmm1\n"
"paddw %%xmm5, %%xmm1\n"
"movdqa %%xmm1, %%xmm2\n" // horizontal reduction: fold 8, then 4, then 2 bytes
"psrldq $0x8, %%xmm2\n"
"paddw %%xmm2, %%xmm1\n"
"movdqa %%xmm1, %%xmm2\n"
"psrldq $0x4, %%xmm2\n"
"paddw %%xmm2, %%xmm1\n"
"movdqa %%xmm1, %%xmm2\n"
"psrldq $0x2, %%xmm2\n"
"paddw %%xmm2, %%xmm1\n"
"movd %%xmm1, %[res]\n"

: [res] "=&r"(result),
  [idx] "+r"(hash_start)	/* modified by the asm, so it must be in/out */
: [end] "r"(page_end), [mul] "r"(multiplyVector)
: "rcx", "rdi",
  "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
  "xmm7", "xmm8", "xmm9", "xmm10", "xmm11",
  "cc", "memory"
);
    result &= 0xFFFF;
	return ((result / 255) % 255) << 8 | (result % 255);
}

/*
 * Fletcher-style checksum over one 8192-byte page read as 32-bit words.
 *
 * Two 64-bit sums are kept: the plain sum of all words and the running sum
 * of that sum. Each is reduced modulo 255 and incremented by one (so
 * neither byte is ever zero); the first result is stored in the first byte
 * of the returned 16-bit value in memory order, the second in the second.
 *
 * page must be readable for 8192 bytes and aligned for uint32_t access.
 */
uint16_t
checksum_fletcher (void *page) {
    const uint32_t *cursor = (const uint32_t *) page;
    const uint32_t *const stop = cursor + 8192 / sizeof(uint32_t);
    uint64_t low = 0;
    uint64_t high = 0;
    union {
        uint16_t value;
        uint8_t bytes[2];
    } packed;

    packed.value = 0;

    while (cursor != stop) {
        low += *cursor++;
        high += low;
    }

    packed.bytes[0] = (uint8_t) ((low % 255) + 1);
    packed.bytes[1] = (uint8_t) ((high % 255) + 1);

    return packed.value;
}

/*
 * Same computation as checksum_fletcher(), with the word loop unrolled by
 * hand: eight 32-bit words are consumed per iteration.
 *
 * Two 64-bit sums are kept: the plain sum of all words and the running sum
 * of that sum. Each is reduced modulo 255 and incremented by one; the first
 * result is stored in the first byte of the returned 16-bit value in memory
 * order, the second in the second.
 *
 * page must be readable for 8192 bytes and aligned for uint32_t access.
 */
uint16_t
checksum_fletcher_unroll (void *page) {
    const uint32_t *group = (const uint32_t *) page;
    const uint32_t *const stop = group + 8192 / sizeof(uint32_t);
    uint64_t low = 0;
    uint64_t high = 0;
    union {
        uint16_t value;
        uint8_t bytes[2];
    } packed;

    packed.value = 0;

    /* 8192 / 4 = 2048 words, an exact multiple of the unroll factor 8 */
    for (; group != stop; group += 8) {
        low += group[0];  high += low;
        low += group[1];  high += low;
        low += group[2];  high += low;
        low += group[3];  high += low;
        low += group[4];  high += low;
        low += group[5];  high += low;
        low += group[6];  high += low;
        low += group[7];  high += low;
    }

    packed.bytes[0] = (uint8_t) ((low % 255) + 1);
    packed.bytes[1] = (uint8_t) ((high % 255) + 1);

    return packed.value;
}


#include "crc32.c"

/**
 *
 * Routine Description:
 *
 * The main routine
 *
 * Arguments:
 *
 *		argc, argv
 *
 * Return value:
 *		
 *		none
 */

#define WARM_LOCAL_VARS			\
	touch = value1;				\
	touch = value2;				\
	touch = before;				\
	touch = after;				\
	touch = mode;				\
	touch = cycles;				\
	touch = total_bytes;		\
	touch = (uint32_t)mode;


int
main (
	int argc, 
	char* argv[])
{
	uint32_t i, j;
	uint8_t help_requested = FALSE;
	uint8_t alignment = 1;
	uint8_t mode=0;
	volatile uint32_t touch;
	uint32_t packet_size = 0;
	uint64_t cycles = 0;
	volatile uint32_t value1 = 0, value2 = 0;
	uint64_t before = 0;
	uint64_t after = 0;
	size_t total_bytes = 0;
	crc_eval_info_t* p_info;
	crc_test_t* p_test;

	p_info = (crc_eval_info_t *)malloc(sizeof(struct crc_eval_info));
	memset((void *)p_info, 0, sizeof(struct crc_eval_info));
	p_info->crc_table_status = INIT_TABLE_STATUS;		
    p_info->crc_data_status  = INIT_DATA_STATUS;		
    p_info->crc_num_of_iterations = INIT_NUM_OF_ITERATIONS;
	p_info->crc_packet_size = INIT_PACKET_SIZE;
	p_info->crc_iteration_style = INIT_ITERATION_STYLE;
	p_info->crc_alignment = INIT_ALIGNMENT;
	p_info->crc_alignment_style = INIT_ALIGNMENT_STYLE;
	p_info->num_tests = INIT_NUM_TESTS;

	printf("\n");
	printf("Evaluation Suite for CRC Generation Algorithms\n" );
	printf("Intel Research and Development, Intel Corporation.\n\n" ); 
	printf("All results are the confidential property of Intel Corporation.\n\n" );
	printf("Run the program with the -help option for documentation.\n" ); 
	printf("------------------------------------------------------------\n" );

	argv++;
    argc--;
    while(argc >0) 
	{	
		if (**argv != '-') 
		{ 
			printf("invalid option format: option %s must begin with \"-\"\n", *argv);
			exit(0);	
		}
		else if(!strcmp(*argv+1, "t")) 
		{
			argv++;
			argc--;
			if(!strcmp(*argv, "warm"))
				p_info->crc_table_status = WARM;
			else if (!strcmp(*argv, "cold"))
				p_info->crc_table_status = COLD;
			else 
			{
				printf("invalid option %s\n", *argv);
				exit(0);
			}
			if(argc > 0) 
			{
				argv++;
				argc--;
			}
			else
				break;
		}
		else if(!strcmp(*argv+1, "d")) 
		{
			argv++;
			argc--;
			if(!strcmp(*argv, "warm"))
				p_info->crc_data_status = WARM;
			else if (!strcmp(*argv, "cold"))
				p_info->crc_data_status = COLD;
			else 
			{
				printf("invalid option %s\n", *argv);
				exit(0);
			}
			if(argc > 0) 
			{
				argv++;
				argc--;
			}
			else
				break;
		}
		else if(!strcmp(*argv+1, "i")) 
		{
			argv++;
			argc--;
			if(argc && (**argv != '-')) 
			{
				p_info->crc_num_of_iterations = atoi(*argv);
				if(argc) 
				{
					argv++;
					argc--;
				}
			}
			else 
			{
				printf("invalid offset: %s after option -o\n", *argv);
				exit(0);
			}    
		 }
		else if(!strcmp(*argv+1, "n")) 
		{
			argv++;
			argc--;
			if(argc && (**argv != '-')) 
			{
				p_info->num_tests = atoi(*argv);
				if(argc) 
				{
					argv++;
					argc--;
				}
			}
			else 
			{
				printf("invalid test size: %s after option -o\n", *argv);
				exit(0);
			}    
		 }
		 else if(!strcmp(*argv+1, "p")) 
		 {
			argv++;
			argc--;
			if(argc && (**argv != '-')) 
			{
				p_info->crc_packet_size = atoi(*argv);
				if(argc) 
				{
					argv++;
					argc--;
				}
			}
			else
			{
				printf("invalid width: %s after option -w\n", *argv);
				exit(0);
			}    
		}
		else if(!strcmp(*argv+1, "a")) 
		 {
			argv++;
			argc--;
			if(argc && (**argv != '-')) 
			{
				p_info->crc_alignment = (uint8_t)atoi(*argv);
				if(argc) 
				{
					argv++;
					argc--;
				}
			}
			else
			{
				printf("invalid width: %s after option -w\n", *argv);
				exit(0);
			}    
		}
		else if(!strcmp(*argv+1, "is")) 
		{
			argv++;
			argc--;
			if(!strcmp(*argv, "random"))
				p_info->crc_iteration_style = RANDOM;
			else if (!strcmp(*argv, "const"))
				p_info->crc_iteration_style = CONSTANT;
			else if (!strcmp(*argv, "incremental"))
				p_info->crc_iteration_style = INCREMENTAL;
			else 
			{
				printf("invalid option %s\n", *argv);
				exit(0);
			}
			if(argc > 0) 
			{
				argv++;
				argc--;
			}
			else
				break;
		}
		else if(!strcmp(*argv+1, "as")) 
		{
			argv++;
			argc--;
			if(!strcmp(*argv, "random"))
				p_info->crc_alignment_style = RANDOM;
			else if (!strcmp(*argv, "const"))
				p_info->crc_alignment_style = CONSTANT;
			else if (!strcmp(*argv, "incremental"))
				p_info->crc_alignment_style = INCREMENTAL;
			else 
			{
				printf("invalid option %s\n", *argv);
				exit(0);
			}
			if(argc > 0) 
			{
				argv++;
				argc--;
			}
			else
				break;
		}
		else if(!strcmp(*argv+1, "help")) 
		{
			help_requested = TRUE;	
			printf("options:\n");
			printf("-t	warm/cold\n");
			printf("	specifies the table status\n");
			printf("-d	warm/cold\n");
			printf("	specifies the data status\n");	
			printf("-i	num_of_iterations\n");
			printf("	sets the number of iterations of the CRC generation tests\n");
			printf("-n	num_tests\n");
			printf("	sets the number of times to run the CRC generation tests\n");
			printf("-p	packet_size\n");
			printf("	sets the size of the packets for which CRC is generated\n");
			printf("-a	alignment (between 1 and 64)\n");
			printf("	sets the number of initial bytes that are not aligned\n");
			printf("-is	const/random/incremental n\n");
			printf("	sets iteration style\n");
			printf("-as	const/random\n");
			printf("  	sets the alignment style\n");
			if(argc) 
			{
				 argv++;
				 argc--;
			}
		}
	}

	if(help_requested == TRUE) 
	{
		exit(0);
	}

	if(p_info->crc_table_status == WARM) 
		printf("Table Status, WARM\n");
 	else
		printf("Table Status, COLD\n");
	if(p_info->crc_data_status == WARM) 
		printf("Data Status, WARM\n");
 	else
		printf("Data Status, COLD\n");	
	printf("Number of Iterations, %d\n", p_info->crc_num_of_iterations);
	printf("Number of tests, %d\n", p_info->num_tests);
	printf("Packet Size (bytes), %d\n", p_info->crc_packet_size);
	if(p_info->crc_iteration_style == CONSTANT) 
		printf("Iteration Style, CONSTANT\n");
 	else if(p_info->crc_iteration_style == RANDOM) 
		printf("Iteration Style, RANDOM\n");
	else  
		printf("Iteration Style, INCREMENTAL\n");
	printf("Alignment (bytes), %d\n", p_info->crc_alignment);
	if(p_info->crc_alignment_style == CONSTANT) 
		printf("Alignment Style, CONSTANT\n");
 	else if(p_info->crc_alignment_style == RANDOM) 
		printf("Alignment Style, RANDOM\n");
	else  
		printf("Alignment Style, INCREMENTAL\n");
	printf("\n");

	//we begin with setting the packet size and alignment
	packet_size = p_info->crc_packet_size;
	alignment	= p_info->crc_alignment;
	if((alignment == 1) && (p_info->crc_alignment_style == CONSTANT))
		mode = MODE_BODY;
	else
		mode = MODE_ALIGN;

	//second, we perform the mpa sample frame test		
	mpa_sample_frame_test();

	// allocate an array of test objects
	
	p_test = (crc_test_t *)malloc(p_info->crc_num_of_iterations * sizeof(crc_test_t));
	memset((void *)p_test, 0, p_info->crc_num_of_iterations * sizeof(crc_test_t));


	//next, we allocate memory for all packet buffers
	for( i = 0; i < p_info->crc_num_of_iterations; i++)
	{	
		total_bytes += packet_size;
		p_test[i].buf_length = packet_size;
		p_test[i].p_sbuf = (uint8_t *)malloc(packet_size * sizeof(uint8_t));
		p_test[i].p_dbuf = (uint8_t *)malloc(packet_size * sizeof(uint8_t));

		//set the data to be random numbers
		for( j = 0; j < p_test[i].buf_length; j++)
			p_test[i].p_sbuf[j] = (uint8_t)rand();

		// compute the reference CRC used to validate other algos
		p_test[i].crc_value = 
			crc32c( NULL, p_test[i].p_sbuf, p_test[i].buf_length, MODE_BODY);

		// Set the packet size for the next iteration.
		switch( p_info->crc_iteration_style )
		{
		case INCREMENTAL:
			packet_size += PACKET_SIZE_INCREMENT;
			if(packet_size >= MAX_BUF_SIZE)
				packet_size = MIN_BUF_SIZE;
			break;

		case RANDOM:
			packet_size = MIN_BUF_SIZE + (rand() % (MAX_BUF_SIZE - MIN_BUF_SIZE) );
			break;

		default:
			// Nothing to do
			break;
		}

		p_test[i].crc_status = CRC_PASSED; 

		switch( p_info->crc_alignment_style )
		{
		case INCREMENTAL:
			p_test[i].alignment += ALIGNMENT_INCREMENT;
			if(p_test[i].alignment >= MAX_ALIGNMENT)
				p_test[i].alignment = MIN_ALIGNMENT;
			break;

		case RANDOM:
			p_test[i].alignment = (uint8_t)(MIN_ALIGNMENT + (rand() % (MAX_ALIGNMENT - MIN_ALIGNMENT)));
			break;

		default:
			// Nothing to do.
			break;
		}
	}
	

	//we test the slice by 8
	cycles = 0;
	for ( j = 0; j < p_info->num_tests; j++) {
	    set_cache_state( p_test, p_info );
	    WARM_LOCAL_VARS;
	    CPU_SYNC;
	    CPU_GET_CYCLES( before );

	    for( i = 0; i < p_info->crc_num_of_iterations; i++ )
	    {	
		    value1 = crc32c_sb8_64_bit(NULL, p_test[i].p_sbuf,
			    p_test[i].buf_length, p_test[i].alignment, mode);
	    }

	    CPU_SYNC;
	    CPU_GET_CYCLES(after);
	    cycles += after - before;
    }
    
	printf("CRC32 slicing by 8 Algorithm (bytes/cycle), %f\n", 
		1.0/((float)(cycles)/(float)(total_bytes * p_info->num_tests)));
    
	cycles = 0;
	for ( j = 0; j < p_info->num_tests; j++) {
	    set_cache_state( p_test, p_info );
	    WARM_LOCAL_VARS;
	    CPU_SYNC;
	    CPU_GET_CYCLES( before );

	    for( i = 0; i < p_info->crc_num_of_iterations; i++ )
	    {
	        value1 = crc32(0, p_test[i].p_sbuf, p_test[i].buf_length);
	    }

	    CPU_SYNC;
	    CPU_GET_CYCLES(after);
	    cycles += after - before;
    }
    
	printf("CRC32 zlib (bytes/cycle), %f\n", 
		1.0/((float)(cycles)/(float)(total_bytes * p_info->num_tests)));
    
	cycles = 0;
	for ( j = 0; j < p_info->num_tests; j++) {
	    set_cache_state( p_test, p_info );
	    WARM_LOCAL_VARS;
	    CPU_SYNC;
	    CPU_GET_CYCLES( before );

	    for(i=0; i < p_info->crc_num_of_iterations; i++)
	    {	
		    value2 = checksum_fletcher(p_test[i].p_sbuf);
	    }


	    CPU_SYNC;
	    CPU_GET_CYCLES(after);
	    cycles += after - before;
    }

	printf("Fletcher Algorithm: (bytes/cycle), %f\n", 
		1.0/((float)(cycles)/(float)(total_bytes * p_info->num_tests)));

	cycles = 0;
	for ( j = 0; j < p_info->num_tests; j++) {
	    set_cache_state( p_test, p_info );
	    WARM_LOCAL_VARS;
	    CPU_SYNC;
	    CPU_GET_CYCLES( before );

	    for(i=0; i < p_info->crc_num_of_iterations; i++)
	    {	
		    value2 = checksum_fletcher_unroll(p_test[i].p_sbuf);
	    }


	    CPU_SYNC;
	    CPU_GET_CYCLES(after);
	    cycles += after - before;
    }

	printf("Fletcher Algorithm hand unrolled: (bytes/cycle), %f\n", 
		1.0/((float)(cycles)/(float)(total_bytes * p_info->num_tests)));


	cycles = 0;
	for ( j = 0; j < p_info->num_tests; j++) {
	    set_cache_state( p_test, p_info );
	    WARM_LOCAL_VARS;
	    CPU_SYNC;
	    CPU_GET_CYCLES( before );

	    for(i=0; i < p_info->crc_num_of_iterations; i++)
	    {	
		    value2 = checksum_simd_i16(p_test[i].p_sbuf);
	    }


	    CPU_SYNC;
	    CPU_GET_CYCLES(after);
	    cycles += after - before;
    }

	printf("SIMD Algorithm (gcc): (bytes/cycle), %f\n", 
		1.0/((float)(cycles)/(float)(total_bytes * p_info->num_tests)));

	cycles = 0;
	for ( j = 0; j < p_info->num_tests; j++) {
	    set_cache_state( p_test, p_info );
	    WARM_LOCAL_VARS;
	    CPU_SYNC;
	    CPU_GET_CYCLES( before );

	    for(i=0; i < p_info->crc_num_of_iterations; i++)
	    {	
		    value2 = checksum_simd_asm(p_test[i].p_sbuf);
	    }


	    CPU_SYNC;
	    CPU_GET_CYCLES(after);
	    cycles += after - before;
    }

	printf("SIMD Algorithm (hand coded): (bytes/cycle), %f\n", 
		1.0/((float)(cycles)/(float)(total_bytes * p_info->num_tests)));

}
