Open main menu

CDOT Wiki β

Difference between revisions of "Assorted Algorithm Alliteration"

(Created page with '{{GPU610/DPS915 Index | 20123}} = Assorted Algorithm Alliteration = == Team Members == # [mailto:mddelacruz1@myseneca.ca?subject=GPU610 Mark de la Cruz] # [mailto:elim2@mysene…')
 
 
(3 intermediate revisions by 2 users not shown)
Line 5: Line 5:
 
== Team Members ==  
 
== Team Members ==  
 
# [mailto:mddelacruz1@myseneca.ca?subject=GPU610 Mark de la Cruz]
 
# [mailto:mddelacruz1@myseneca.ca?subject=GPU610 Mark de la Cruz]
# [mailto:elim2@myseneca.ca?subject=GPU610 Edwin Lim]
+
# [mailto:elim2@myseneca.ca?subject=GPU610 Edwin Lim] - MD5 Checksum Calculator
# [mailto:mdafidchao@myseneca.ca?subject=GPU610 Michael Afidchao]
+
# [mailto:mdafidchao@myseneca.ca?subject=GPU610 Michael Afidchao] - Game of Life
 
# [mailto:mddelacruz1@myseneca.ca;elim2@myseneca.ca;mdafidchao@myseneca.ca?subject=GPU610 eMail All]
 
# [mailto:mddelacruz1@myseneca.ca;elim2@myseneca.ca;mdafidchao@myseneca.ca?subject=GPU610 eMail All]
  
== Proposal ==
+
===Game of Life - Michael Afidchao===
=== Game of Life ===
 
 
The Game of Life is a "0 player game" cellular automaton. With an initial configuration,
 
The Game of Life is a "0 player game" cellular automaton. With an initial configuration,
 
the game uses a set of rules to determine what happens to the life forms from generation
 
the game uses a set of rules to determine what happens to the life forms from generation
Line 78: Line 77:
 
</pre>
 
</pre>
  
 +
===MD5 Checksum Calculator - Edwin Lim===
 +
The md5 checksum is a commonly used hash function that produces a 128-bit hash value commonly used to check data integrity. It is also used in a wide variety of security applications, however its use in checking data integrity is what will be explored in this assignment.
 +
 +
The full source code for the MD5 checksum calculator can be found here: [http://www.codeforge.com/article/91071]
 +
 +
===Profile===
 +
The following result (external link) was found after attempting to generate the md5 checksum for a 6.4gb file (Windows Vista ISO):
 +
[http://i.imgur.com/B34r9.png]]
 +
 +
===Code Snippet===
 +
From analysis of the above profile, the code to be targetted for optimization is the following:
 +
<pre>
 +
void _MD5_transform
 +
  (unsigned int state[4],
 +
  unsigned char block[64])
 +
{
 +
 +
  unsigned int lA = state[0], lB = state[1], lC = state[2], lD = state[3];
 +
  unsigned int x[16];
 +
 +
  _MD5_decode (x, block, 64);
 +
 +
  // round 1
 +
  FF ( lA, lB, lC, lD, x[ 0], S11, 0xd76aa478); // 1
 +
  FF ( lD, lA, lB, lC, x[ 1], S12, 0xe8c7b756); // 2
 +
  FF ( lC, lD, lA, lB, x[ 2], S13, 0x242070db); // 3
 +
  FF ( lB, lC, lD, lA, x[ 3], S14, 0xc1bdceee); // 4
 +
  FF ( lA, lB, lC, lD, x[ 4], S11, 0xf57c0faf); // 5
 +
  FF ( lD, lA, lB, lC, x[ 5], S12, 0x4787c62a); // 6
 +
  FF ( lC, lD, lA, lB, x[ 6], S13, 0xa8304613); // 7
 +
  FF ( lB, lC, lD, lA, x[ 7], S14, 0xfd469501); // 8
 +
  FF ( lA, lB, lC, lD, x[ 8], S11, 0x698098d8); // 9
 +
  FF ( lD, lA, lB, lC, x[ 9], S12, 0x8b44f7af); // 10
 +
  FF ( lC, lD, lA, lB, x[10], S13, 0xffff5bb1); // 11
 +
  FF ( lB, lC, lD, lA, x[11], S14, 0x895cd7be); // 12
 +
  FF ( lA, lB, lC, lD, x[12], S11, 0x6b901122); // 13
 +
  FF ( lD, lA, lB, lC, x[13], S12, 0xfd987193); // 14
 +
  FF ( lC, lD, lA, lB, x[14], S13, 0xa679438e); // 15
 +
  FF ( lB, lC, lD, lA, x[15], S14, 0x49b40821); // 16
 +
 +
  // round 2
 +
  GG ( lA, lB, lC, lD, x[ 1], S21, 0xf61e2562); // 17
 +
  GG ( lD, lA, lB, lC, x[ 6], S22, 0xc040b340); // 18
 +
  GG ( lC, lD, lA, lB, x[11], S23, 0x265e5a51); // 19
 +
  GG ( lB, lC, lD, lA, x[ 0], S24, 0xe9b6c7aa); // 20
 +
  GG ( lA, lB, lC, lD, x[ 5], S21, 0xd62f105d); // 21
 +
  GG ( lD, lA, lB, lC, x[10], S22,  0x2441453); // 22
 +
  GG ( lC, lD, lA, lB, x[15], S23, 0xd8a1e681); // 23
 +
  GG ( lB, lC, lD, lA, x[ 4], S24, 0xe7d3fbc8); // 24
 +
  GG ( lA, lB, lC, lD, x[ 9], S21, 0x21e1cde6); // 25
 +
  GG ( lD, lA, lB, lC, x[14], S22, 0xc33707d6); // 26
 +
  GG ( lC, lD, lA, lB, x[ 3], S23, 0xf4d50d87); // 27
 +
  GG ( lB, lC, lD, lA, x[ 8], S24, 0x455a14ed); // 28
 +
  GG ( lA, lB, lC, lD, x[13], S21, 0xa9e3e905); // 29
 +
  GG ( lD, lA, lB, lC, x[ 2], S22, 0xfcefa3f8); // 30
 +
  GG ( lC, lD, lA, lB, x[ 7], S23, 0x676f02d9); // 31
 +
  GG ( lB, lC, lD, lA, x[12], S24, 0x8d2a4c8a); // 32
 +
 +
  // round 3
 +
  HH ( lA, lB, lC, lD, x[ 5], S31, 0xfffa3942); // 33
 +
  HH ( lD, lA, lB, lC, x[ 8], S32, 0x8771f681); // 34
 +
  HH ( lC, lD, lA, lB, x[11], S33, 0x6d9d6122); // 35
 +
  HH ( lB, lC, lD, lA, x[14], S34, 0xfde5380c); // 36
 +
  HH ( lA, lB, lC, lD, x[ 1], S31, 0xa4beea44); // 37
 +
  HH ( lD, lA, lB, lC, x[ 4], S32, 0x4bdecfa9); // 38
 +
  HH ( lC, lD, lA, lB, x[ 7], S33, 0xf6bb4b60); // 39
 +
  HH ( lB, lC, lD, lA, x[10], S34, 0xbebfbc70); // 40
 +
  HH ( lA, lB, lC, lD, x[13], S31, 0x289b7ec6); // 41
 +
  HH ( lD, lA, lB, lC, x[ 0], S32, 0xeaa127fa); // 42
 +
  HH ( lC, lD, lA, lB, x[ 3], S33, 0xd4ef3085); // 43
 +
  HH ( lB, lC, lD, lA, x[ 6], S34,  0x4881d05); // 44
 +
  HH ( lA, lB, lC, lD, x[ 9], S31, 0xd9d4d039); // 45
 +
  HH ( lD, lA, lB, lC, x[12], S32, 0xe6db99e5); // 46
 +
  HH ( lC, lD, lA, lB, x[15], S33, 0x1fa27cf8); // 47
 +
  HH ( lB, lC, lD, lA, x[ 2], S34, 0xc4ac5665); // 48
 +
 +
  // round 4
 +
  II ( lA, lB, lC, lD, x[ 0], S41, 0xf4292244); // 49
 +
  II ( lD, lA, lB, lC, x[ 7], S42, 0x432aff97); // 50
 +
  II ( lC, lD, lA, lB, x[14], S43, 0xab9423a7); // 51
 +
  II ( lB, lC, lD, lA, x[ 5], S44, 0xfc93a039); // 52
 +
  II ( lA, lB, lC, lD, x[12], S41, 0x655b59c3); // 53
 +
  II ( lD, lA, lB, lC, x[ 3], S42, 0x8f0ccc92); // 54
 +
  II ( lC, lD, lA, lB, x[10], S43, 0xffeff47d); // 55
 +
  II ( lB, lC, lD, lA, x[ 1], S44, 0x85845dd1); // 56
 +
  II ( lA, lB, lC, lD, x[ 8], S41, 0x6fa87e4f); // 57
 +
  II ( lD, lA, lB, lC, x[15], S42, 0xfe2ce6e0); // 58
 +
  II ( lC, lD, lA, lB, x[ 6], S43, 0xa3014314); // 59
 +
  II ( lB, lC, lD, lA, x[13], S44, 0x4e0811a1); // 60
 +
  II ( lA, lB, lC, lD, x[ 4], S41, 0xf7537e82); // 61
 +
  II ( lD, lA, lB, lC, x[11], S42, 0xbd3af235); // 62
 +
  II ( lC, lD, lA, lB, x[ 2], S43, 0x2ad7d2bb); // 63
 +
  II ( lB, lC, lD, lA, x[ 9], S44, 0xeb86d391); // 64
 +
 +
  state[0] += lA;
 +
  state[1] += lB;
 +
  state[2] += lC;
 +
  state[3] += lD;
 +
 +
  // lClear sensitive information
 +
  memset(x, 0, 16);
 +
}
 +
</pre>
 +
 +
=== Bzip2 - Mark de la Cruz ===
 +
From http://www.bzip.org/: bzip2 is a freely available, patent free (see below), high-quality data compressor. It typically compresses files to within 10% to 15% of the best available techniques (the PPM family of statistical compressors), whilst being around twice as fast at compression and six times faster at decompression.
 +
The full source can be found in the link above. 
 +
 +
Below is the profile and the function to be optimized:
 +
<pre>
 +
 +
Flat profile:
 +
 +
 +
 +
 +
Each sample counts as 0.01 seconds.
 +
 +
  %  cumulative  self              self    total         
 +
 +
time  seconds  seconds    calls  s/call  s/call  name   
 +
 +
63.21      7.01    7.01      23    0.30    0.47  BZ2_compressBlock
 +
 +
28.85    10.21    3.20      23    0.14    0.14  mainSort
 +
 +
  4.78    10.74    0.53        1    0.53    0.53  fallbackSort
 +
 +
  2.34    11.00    0.26    8280    0.00    0.00  default_bzalloc
 +
 +
  0.54    11.06    0.06      23    0.00    0.16  BZ2_blockSort
 +
 +
  0.27    11.09    0.03      552    0.00    0.00  BZ2_hbMakeCodeLengths
 +
 +
  0.00    11.09    0.00  113812    0.00    0.00  add_pair_to_block
 +
 +
  0.00    11.09    0.00    58370    0.00    0.00  _init
 +
 +
  0.00    11.09    0.00    8276    0.00    0.00  BZ2_bzCompress
 +
 +
  0.00    11.09    0.00    4159    0.00    0.00  myfeof
 +
 +
  0.00    11.09    0.00    4158    0.00    0.00  BZ2_bzWrite
 +
 +
  0.00    11.09    0.00      148    0.00    0.00  bsPutUChar
 +
 +
  0.00    11.09    0.00      138    0.00    0.00  BZ2_hbAssignCodes
 +
 +
  0.00    11.09    0.00      24    0.00    0.00  bsPutUInt32
 +
 +
  0.00    11.09    0.00        4    0.00    0.00  default_bzfree
 +
 +
  0.00    11.09    0.00        4    0.00    0.00  hasSuffix
 +
 +
  0.00    11.09    0.00        4    0.00    0.00  testf
 +
 +
  0.00    11.09    0.00        2    0.00    0.00  copyFileName
 +
 +
  0.00    11.09    0.00        2    0.00    0.00  fileExists
 +
 +
  0.00    11.09    0.00        1    0.00    0.00  BZ2_bzCompressEnd
 +
 +
  0.00    11.09    0.00        1    0.00    0.01  BZ2_bzCompressInit
 +
 +
  0.00    11.09    0.00        1    0.00    0.24  BZ2_bzWriteClose64
 +
 +
  0.00    11.09    0.00        1    0.00    0.01  BZ2_bzWriteOpen
 +
 +
  0.00    11.09    0.00        1    0.00    0.00  applySavedFileAttrToOutputFile
 +
 +
  0.00    11.09    0.00        1    0.00    11.09  compressStream
 +
 +
 +
 +
 +
 +
 +
 +
|===============================================================================|
 +
 +
Source code to compressBlock
 +
 +
 +
 +
 +
/*---------------------------------------------------*/
 +
 +
void BZ2_compressBlock ( EState* s, Bool is_last_block )
 +
 +
{
 +
 +
  if (s->nblock > 0) {
 +
 +
 +
 +
 +
      BZ_FINALISE_CRC ( s->blockCRC );
 +
 +
      s->combinedCRC = (s->combinedCRC << 1) | (s->combinedCRC >> 31);
 +
 +
      s->combinedCRC ^= s->blockCRC;
 +
 +
      if (s->blockNo > 1) s->numZ = 0;
 +
 +
 +
 +
 +
      if (s->verbosity >= 2)
 +
 +
        VPrintf4( "    block %d: crc = 0x%08x, "
 +
 +
                  "combined CRC = 0x%08x, size = %d\n",
 +
 +
                  s->blockNo, s->blockCRC, s->combinedCRC, s->nblock );
 +
 +
 +
 +
 +
      BZ2_blockSort ( s );
 +
 +
  }
 +
 +
 +
 +
 +
  s->zbits = (UChar*) (&((UChar*)s->arr2)[s->nblock]);
 +
 +
 +
 +
 +
  /*-- If this is the first block, create the stream header. --*/
 +
 +
  if (s->blockNo == 1) {
 +
 +
      BZ2_bsInitWrite ( s );
 +
 +
      bsPutUChar ( s, BZ_HDR_B );
 +
 +
      bsPutUChar ( s, BZ_HDR_Z );
 +
 +
      bsPutUChar ( s, BZ_HDR_h );
 +
 +
      bsPutUChar ( s, (UChar)(BZ_HDR_0 + s->blockSize100k) );
 +
 +
  }
 +
 +
 +
 +
 +
  if (s->nblock > 0) {
 +
 +
 +
 +
 +
      bsPutUChar ( s, 0x31 ); bsPutUChar ( s, 0x41 );
 +
 +
      bsPutUChar ( s, 0x59 ); bsPutUChar ( s, 0x26 );
 +
 +
      bsPutUChar ( s, 0x53 ); bsPutUChar ( s, 0x59 );
 +
 +
 +
 +
 +
      /*-- Now the block's CRC, so it is in a known place. --*/
 +
 +
      bsPutUInt32 ( s, s->blockCRC );
 +
 +
 +
 +
 +
      /*--
 +
 +
        Now a single bit indicating (non-)randomisation.
 +
 +
        As of version 0.9.5, we use a better sorting algorithm
 +
 +
        which makes randomisation unnecessary.  So always set
 +
 +
        the randomised bit to 'no'.  Of course, the decoder
 +
 +
        still needs to be able to handle randomised blocks
 +
 +
        so as to maintain backwards compatibility with
 +
 +
        older versions of bzip2.
 +
 +
      --*/
 +
 +
      bsW(s,1,0);
 +
 +
 +
 +
 +
      bsW ( s, 24, s->origPtr );
 +
 +
      generateMTFValues ( s );
 +
 +
      sendMTFValues ( s );
 +
 +
  }
 +
 +
 +
 +
 +
 +
 +
 +
  /*-- If this is the last block, add the stream trailer. --*/
 +
 +
  if (is_last_block) {
 +
 +
 +
 +
 +
      bsPutUChar ( s, 0x17 ); bsPutUChar ( s, 0x72 );
 +
 +
      bsPutUChar ( s, 0x45 ); bsPutUChar ( s, 0x38 );
 +
 +
      bsPutUChar ( s, 0x50 ); bsPutUChar ( s, 0x90 );
 +
 +
      bsPutUInt32 ( s, s->combinedCRC );
 +
 +
      if (s->verbosity >= 2)
 +
 +
        VPrintf1( "    final combined CRC = 0x%08x\n  ", s->combinedCRC );
 +
 +
      bsFinishWrite ( s );
 +
 +
  }
 +
 +
}
 +
 +
</pre>
  
 
== Instructor's Comments ==
 
== Instructor's Comments ==

Latest revision as of 10:34, 11 October 2012

Assorted Algorithm Alliteration

Team Members

  1. Mark de la Cruz
  2. Edwin Lim - MD5 Checksum Calculator
  3. Michael Afidchao - Game of Life
  4. eMail All

Game of Life - Michael Afidchao

The Game of Life is a "0 player game" cellular automaton. With an initial configuration, the game uses a set of rules to determine what happens to the life forms from generation to generation. More information can be found at http://en.wikipedia.org/wiki/Conway%27s_Game_of_Life

Source code can be found here:

http://www.shodor.org/media/content//petascale/materials/UPModules/exercises/Game_of_Life/GoL_Serial_Source_Files.zip

Profile

These results were taken with an execution of 50000 generations and default settings for the rest of the options.

Each sample counts as 0.01 seconds.
  %   cumulative   self              self     total
 time   seconds   seconds    calls  Ts/call  Ts/call  name
 55.82     14.76    14.76                             eval_rules
 41.75     25.80    11.04                             do_draw
  2.23     26.39     0.59                             update_grid
  0.19     26.44     0.05                             copy_bounds
  0.00     26.44     0.00    11025     0.00     0.00  rand_double
  0.00     26.44     0.00        1     0.00     0.00  allocate_grids
  0.00     26.44     0.00        1     0.00     0.00  free_grids
  0.00     26.44     0.00        1     0.00     0.00  init_grids
  0.00     26.44     0.00        1     0.00     0.00  moveWindow
  0.00     26.44     0.00        1     0.00     0.00  parse_args
  0.00     26.44     0.00        1     0.00     0.00  randomize_grid
  0.00     26.44     0.00        1     0.00     0.00  setupWindow
  0.00     26.44     0.00        1     0.00     0.00  write_grid

Code Snippet

/*
        eval_rules()
                Evaluate the rules of Life for each cell; count
                neighbors and update current state accordingly.
*/
void eval_rules (struct life_t * life) {
        int i,j,k,l,neighbors;

        int ncols = life->ncols;
        int nrows = life->nrows;

        int ** grid      = life->grid;
        int ** next_grid = life->next_grid;

        for (i = 1; i <= ncols; i++) {
                for (j = 1; j <= nrows; j++) {
                        neighbors = 0;

                        // count neighbors
                        for (k = i-1; k <= i+1; k++) {
                                for (l = j-1; l <= j+1; l++) {
                                        if (!(k == i && l == j) && grid[k][l] != DEAD)
                                                neighbors++;
                                }
                        }

                        // update state
                        if (neighbors < LOWER_THRESH || neighbors > UPPER_THRESH)
                                next_grid[i][j] = DEAD;
                        else if (grid[i][j] != DEAD || neighbors == SPAWN_THRESH)
                                next_grid[i][j] = grid[i][j]+1;
                }
        }
}

MD5 Checksum Calculator - Edwin Lim

The md5 checksum is a commonly used hash function that produces a 128-bit hash value commonly used to check data integrity. It is also used in a wide variety of security applications, however its use in checking data integrity is what will be explored in this assignment.

The full source code for the MD5 checksum calculator can be found here: [1]

Profile

The following result (external link) was found after attempting to generate the md5 checksum for a 6.4gb file (Windows Vista ISO): [2]]

Code Snippet

From analysis of the above profile, the code to be targetted for optimization is the following:

void _MD5_transform
  (unsigned int state[4],
   unsigned char block[64])
{

  unsigned int lA = state[0], lB = state[1], lC = state[2], lD = state[3];
  unsigned int x[16];
 
  _MD5_decode (x, block, 64);

  // round 1
  FF ( lA, lB, lC, lD, x[ 0], S11, 0xd76aa478); // 1
  FF ( lD, lA, lB, lC, x[ 1], S12, 0xe8c7b756); // 2
  FF ( lC, lD, lA, lB, x[ 2], S13, 0x242070db); // 3
  FF ( lB, lC, lD, lA, x[ 3], S14, 0xc1bdceee); // 4
  FF ( lA, lB, lC, lD, x[ 4], S11, 0xf57c0faf); // 5
  FF ( lD, lA, lB, lC, x[ 5], S12, 0x4787c62a); // 6
  FF ( lC, lD, lA, lB, x[ 6], S13, 0xa8304613); // 7
  FF ( lB, lC, lD, lA, x[ 7], S14, 0xfd469501); // 8
  FF ( lA, lB, lC, lD, x[ 8], S11, 0x698098d8); // 9
  FF ( lD, lA, lB, lC, x[ 9], S12, 0x8b44f7af); // 10
  FF ( lC, lD, lA, lB, x[10], S13, 0xffff5bb1); // 11
  FF ( lB, lC, lD, lA, x[11], S14, 0x895cd7be); // 12
  FF ( lA, lB, lC, lD, x[12], S11, 0x6b901122); // 13
  FF ( lD, lA, lB, lC, x[13], S12, 0xfd987193); // 14
  FF ( lC, lD, lA, lB, x[14], S13, 0xa679438e); // 15
  FF ( lB, lC, lD, lA, x[15], S14, 0x49b40821); // 16

  // round 2
  GG ( lA, lB, lC, lD, x[ 1], S21, 0xf61e2562); // 17
  GG ( lD, lA, lB, lC, x[ 6], S22, 0xc040b340); // 18
  GG ( lC, lD, lA, lB, x[11], S23, 0x265e5a51); // 19
  GG ( lB, lC, lD, lA, x[ 0], S24, 0xe9b6c7aa); // 20
  GG ( lA, lB, lC, lD, x[ 5], S21, 0xd62f105d); // 21
  GG ( lD, lA, lB, lC, x[10], S22,  0x2441453); // 22
  GG ( lC, lD, lA, lB, x[15], S23, 0xd8a1e681); // 23
  GG ( lB, lC, lD, lA, x[ 4], S24, 0xe7d3fbc8); // 24
  GG ( lA, lB, lC, lD, x[ 9], S21, 0x21e1cde6); // 25
  GG ( lD, lA, lB, lC, x[14], S22, 0xc33707d6); // 26
  GG ( lC, lD, lA, lB, x[ 3], S23, 0xf4d50d87); // 27
  GG ( lB, lC, lD, lA, x[ 8], S24, 0x455a14ed); // 28
  GG ( lA, lB, lC, lD, x[13], S21, 0xa9e3e905); // 29
  GG ( lD, lA, lB, lC, x[ 2], S22, 0xfcefa3f8); // 30
  GG ( lC, lD, lA, lB, x[ 7], S23, 0x676f02d9); // 31
  GG ( lB, lC, lD, lA, x[12], S24, 0x8d2a4c8a); // 32

  // round 3
  HH ( lA, lB, lC, lD, x[ 5], S31, 0xfffa3942); // 33
  HH ( lD, lA, lB, lC, x[ 8], S32, 0x8771f681); // 34
  HH ( lC, lD, lA, lB, x[11], S33, 0x6d9d6122); // 35
  HH ( lB, lC, lD, lA, x[14], S34, 0xfde5380c); // 36
  HH ( lA, lB, lC, lD, x[ 1], S31, 0xa4beea44); // 37
  HH ( lD, lA, lB, lC, x[ 4], S32, 0x4bdecfa9); // 38
  HH ( lC, lD, lA, lB, x[ 7], S33, 0xf6bb4b60); // 39
  HH ( lB, lC, lD, lA, x[10], S34, 0xbebfbc70); // 40
  HH ( lA, lB, lC, lD, x[13], S31, 0x289b7ec6); // 41
  HH ( lD, lA, lB, lC, x[ 0], S32, 0xeaa127fa); // 42
  HH ( lC, lD, lA, lB, x[ 3], S33, 0xd4ef3085); // 43
  HH ( lB, lC, lD, lA, x[ 6], S34,  0x4881d05); // 44
  HH ( lA, lB, lC, lD, x[ 9], S31, 0xd9d4d039); // 45
  HH ( lD, lA, lB, lC, x[12], S32, 0xe6db99e5); // 46
  HH ( lC, lD, lA, lB, x[15], S33, 0x1fa27cf8); // 47
  HH ( lB, lC, lD, lA, x[ 2], S34, 0xc4ac5665); // 48

  // round 4
  II ( lA, lB, lC, lD, x[ 0], S41, 0xf4292244); // 49
  II ( lD, lA, lB, lC, x[ 7], S42, 0x432aff97); // 50
  II ( lC, lD, lA, lB, x[14], S43, 0xab9423a7); // 51
  II ( lB, lC, lD, lA, x[ 5], S44, 0xfc93a039); // 52
  II ( lA, lB, lC, lD, x[12], S41, 0x655b59c3); // 53
  II ( lD, lA, lB, lC, x[ 3], S42, 0x8f0ccc92); // 54
  II ( lC, lD, lA, lB, x[10], S43, 0xffeff47d); // 55
  II ( lB, lC, lD, lA, x[ 1], S44, 0x85845dd1); // 56
  II ( lA, lB, lC, lD, x[ 8], S41, 0x6fa87e4f); // 57
  II ( lD, lA, lB, lC, x[15], S42, 0xfe2ce6e0); // 58
  II ( lC, lD, lA, lB, x[ 6], S43, 0xa3014314); // 59
  II ( lB, lC, lD, lA, x[13], S44, 0x4e0811a1); // 60
  II ( lA, lB, lC, lD, x[ 4], S41, 0xf7537e82); // 61
  II ( lD, lA, lB, lC, x[11], S42, 0xbd3af235); // 62
  II ( lC, lD, lA, lB, x[ 2], S43, 0x2ad7d2bb); // 63
  II ( lB, lC, lD, lA, x[ 9], S44, 0xeb86d391); // 64

  state[0] += lA;
  state[1] += lB;
  state[2] += lC;
  state[3] += lD;
 
  // lClear sensitive information
  memset(x, 0, 16);
}

Bzip2 - Mark de la Cruz

From http://www.bzip.org/: bzip2 is a freely available, patent free (see below), high-quality data compressor. It typically compresses files to within 10% to 15% of the best available techniques (the PPM family of statistical compressors), whilst being around twice as fast at compression and six times faster at decompression. The full source can be found in the link above.

Below is the profile and the function to be optimized:


Flat profile:




Each sample counts as 0.01 seconds.

  %   cumulative   self              self     total           

 time   seconds   seconds    calls   s/call   s/call  name    

 63.21      7.01     7.01       23     0.30     0.47  BZ2_compressBlock

 28.85     10.21     3.20       23     0.14     0.14  mainSort

  4.78     10.74     0.53        1     0.53     0.53  fallbackSort

  2.34     11.00     0.26     8280     0.00     0.00  default_bzalloc

  0.54     11.06     0.06       23     0.00     0.16  BZ2_blockSort

  0.27     11.09     0.03      552     0.00     0.00  BZ2_hbMakeCodeLengths

  0.00     11.09     0.00   113812     0.00     0.00  add_pair_to_block

  0.00     11.09     0.00    58370     0.00     0.00  _init

  0.00     11.09     0.00     8276     0.00     0.00  BZ2_bzCompress

  0.00     11.09     0.00     4159     0.00     0.00  myfeof

  0.00     11.09     0.00     4158     0.00     0.00  BZ2_bzWrite

  0.00     11.09     0.00      148     0.00     0.00  bsPutUChar

  0.00     11.09     0.00      138     0.00     0.00  BZ2_hbAssignCodes

  0.00     11.09     0.00       24     0.00     0.00  bsPutUInt32

  0.00     11.09     0.00        4     0.00     0.00  default_bzfree

  0.00     11.09     0.00        4     0.00     0.00  hasSuffix

  0.00     11.09     0.00        4     0.00     0.00  testf

  0.00     11.09     0.00        2     0.00     0.00  copyFileName

  0.00     11.09     0.00        2     0.00     0.00  fileExists

  0.00     11.09     0.00        1     0.00     0.00  BZ2_bzCompressEnd

  0.00     11.09     0.00        1     0.00     0.01  BZ2_bzCompressInit

  0.00     11.09     0.00        1     0.00     0.24  BZ2_bzWriteClose64

  0.00     11.09     0.00        1     0.00     0.01  BZ2_bzWriteOpen

  0.00     11.09     0.00        1     0.00     0.00  applySavedFileAttrToOutputFile

  0.00     11.09     0.00        1     0.00    11.09  compressStream







|===============================================================================|

Source code to compressBlock




/*---------------------------------------------------*/

void BZ2_compressBlock ( EState* s, Bool is_last_block )

{

   if (s->nblock > 0) {




      BZ_FINALISE_CRC ( s->blockCRC );

      s->combinedCRC = (s->combinedCRC << 1) | (s->combinedCRC >> 31);

      s->combinedCRC ^= s->blockCRC;

      if (s->blockNo > 1) s->numZ = 0;




      if (s->verbosity >= 2)

         VPrintf4( "    block %d: crc = 0x%08x, "

                   "combined CRC = 0x%08x, size = %d\n",

                   s->blockNo, s->blockCRC, s->combinedCRC, s->nblock );




      BZ2_blockSort ( s );

   }




   s->zbits = (UChar*) (&((UChar*)s->arr2)[s->nblock]);




   /*-- If this is the first block, create the stream header. --*/

   if (s->blockNo == 1) {

      BZ2_bsInitWrite ( s );

      bsPutUChar ( s, BZ_HDR_B );

      bsPutUChar ( s, BZ_HDR_Z );

      bsPutUChar ( s, BZ_HDR_h );

      bsPutUChar ( s, (UChar)(BZ_HDR_0 + s->blockSize100k) );

   }




   if (s->nblock > 0) {




      bsPutUChar ( s, 0x31 ); bsPutUChar ( s, 0x41 );

      bsPutUChar ( s, 0x59 ); bsPutUChar ( s, 0x26 );

      bsPutUChar ( s, 0x53 ); bsPutUChar ( s, 0x59 );




      /*-- Now the block's CRC, so it is in a known place. --*/

      bsPutUInt32 ( s, s->blockCRC );




      /*-- 

         Now a single bit indicating (non-)randomisation. 

         As of version 0.9.5, we use a better sorting algorithm

         which makes randomisation unnecessary.  So always set

         the randomised bit to 'no'.  Of course, the decoder

         still needs to be able to handle randomised blocks

         so as to maintain backwards compatibility with

         older versions of bzip2.

      --*/

      bsW(s,1,0);




      bsW ( s, 24, s->origPtr );

      generateMTFValues ( s );

      sendMTFValues ( s );

   }







   /*-- If this is the last block, add the stream trailer. --*/

   if (is_last_block) {




      bsPutUChar ( s, 0x17 ); bsPutUChar ( s, 0x72 );

      bsPutUChar ( s, 0x45 ); bsPutUChar ( s, 0x38 );

      bsPutUChar ( s, 0x50 ); bsPutUChar ( s, 0x90 );

      bsPutUInt32 ( s, s->combinedCRC );

      if (s->verbosity >= 2)

         VPrintf1( "    final combined CRC = 0x%08x\n   ", s->combinedCRC );

      bsFinishWrite ( s );

   }

}

Instructor's Comments