LZW Demo

update@2013-09-01

这几天把以前的资料整理了下,感觉之前在网上找的lzw代码太臃肿,就用C重写了一个。在之前的博文已经把基本的算法步骤列出来了,压缩过程比较简单,但解压过程有点繁琐,需要注意的是解压过程中,当读到的编码值和当前编码值相等的时候该如何处理,另外就是如何处理变长编码。代码中有两个hash函数,一个是直接将前后缀的二进制拼接在一起,一个是JSHash。程序在压缩的时候会输出一些信息,方便理解压缩的过程,解压结果也会输出在屏幕上。压缩结果保存在以lzw为后缀的文件里,解压结果保存在以copy为后缀的文件里。理论上任意文件都能正确地压缩和解压缩,欢迎试用和反馈。

PS:写压缩程序就像制作一块机械表,关注每一个bit每一个byte的意义,需要高度地专注才能完成这么精细的工作,写完很有成就感:),不过写出bug也挺惨的:(


  1. example
  2. source code


1 example

图1展示了对一个文本文件的压缩过程,cat txt显示了待压缩文本的内容。现在来看下压缩过程,前两个A相组合,字典中找不到,将前缀A以9个bit位输出到缓冲区,后缀A作为前缀,继续读取字符,并将其作为后缀。读到的仍然是A,这时组合(A,A)已经存在于字典中,将组合(A,A)的编码值256作为前缀,继续读下一个字符,以此类推,直到当前组合不存在于字典中,即(256,A)不在字典中,将256以9个bit位输出到缓冲区。按照这种方式一直处理到文件尾。解压的步骤见之前的博客和源代码。

图1 压缩过程和解压结果

2 source code

源代码,当中包括了调试用的断言和输出。

    1 #include <time.h>
    2 #include <stdlib.h>
    3 #include <stdio.h>
    4 #include <assert.h>
    5 #include <string.h>
    6 
    7 unsigned char *m_file_buffer;
    8 unsigned char *m_comp_buffer;
    9 unsigned int *prefix_table;
   10 unsigned char *suffix_table;
   11 unsigned int *hash_table;
   12 unsigned int m_file_buffer_cnt;
   13 unsigned int m_comp_buffer_cnt;
   14 unsigned char decode_stack[4000];
   15 
   16 //bit buffer
   17 unsigned int m_encode_bit_buffer;
   18 unsigned int m_decode_bit_buffer;
   19 int m_encode_bit_buffer_cnt;
   20 int m_decode_bit_buffer_cnt;
   21 
   22 size_t m_file_size;
   23 
   24 #define COMP_BUFFER_SIZE 1024
   25 #define MIN_CODE_BITS 9
   26 #define MAX_CODE_BITS 20
   27 #define MAX_CODE_VALUE ((1 << MAX_CODE_BITS) -1)
   28 #define CURR_MAX_CODE_VALUE(x) ( (1u << x) -1)
   29 #define FIRST_CODE ( 1<< (MIN_CODE_BITS - 1))
   30 #define hash_shift  (MAX_CODE_BITS - 8)
   31 #define HASH_TABLE_SIZE 1048583 //HASH_TABLE_SIZE应该是大于MAX_CODE_VALUE的第一个素数
   32 
   33 size_t get_file_size(FILE *file) {
   34     fseek(file,0,SEEK_END);
   35     size_t file_size = ftell(file);
   36     fseek(file,0,SEEK_SET);
   37     return file_size;
   38 }
   39 
   40 void __saveBits(int nCode,int nBitCnt)
   41 {
   42     assert(nBitCnt <= 25);
   43     if(-1 == nCode) {
   44         if(m_encode_bit_buffer_cnt) {
   45             m_encode_bit_buffer >>= 24;
   46             m_comp_buffer[ m_comp_buffer_cnt++ ] = (unsigned char)m_encode_bit_buffer;
   47             m_encode_bit_buffer_cnt = 0;
   48             m_encode_bit_buffer     = 0u;
   49         }
   50         return;
   51     }
   52     m_encode_bit_buffer |= ((unsigned int) nCode) << (32 - nBitCnt - m_encode_bit_buffer_cnt);
   53     m_encode_bit_buffer_cnt += nBitCnt;
   54     while (m_encode_bit_buffer_cnt >= 8) {
   55         m_comp_buffer[ m_comp_buffer_cnt++ ] = (unsigned char )(m_encode_bit_buffer >> 24 );
   56         m_encode_bit_buffer    <<= 8;
   57         m_encode_bit_buffer_cnt -= 8;
   58     }
   59 }
   60 unsigned int __readBits(int nBitCnt)
   61 {
   62     assert(nBitCnt <= 25 && nBitCnt >= 1);
   63     int c;
   64     unsigned int return_value;
   65     while (m_decode_bit_buffer_cnt <= 24 && m_file_buffer_cnt < m_file_size) {
   66         c = m_file_buffer [ m_file_buffer_cnt++ ];
   67         m_decode_bit_buffer |= (unsigned int) c << (24-m_decode_bit_buffer_cnt);
   68         m_decode_bit_buffer_cnt += 8;
   69     }
   70     if(m_decode_bit_buffer_cnt < (unsigned)nBitCnt) {
   71         return -1; /* EOF */
   72     }
   73     return_value=m_decode_bit_buffer >> (32-nBitCnt);
   74     m_decode_bit_buffer <<= nBitCnt;
   75     m_decode_bit_buffer_cnt -= nBitCnt;
   76     return(return_value);
   77 }
   78 
   79 inline unsigned int hash_func(const unsigned int prefix,const unsigned char suffix) {
   80     return (suffix << hash_shift) ^ prefix;
   81 }
   82 inline unsigned int JSHash(const unsigned int prefix,const unsigned char suffix){
   83     unsigned int hash = 1315423911;
   84     hash ^= ((hash<<5) + prefix + (hash>>2));
   85     hash ^= ((hash<<5) + suffix + (hash>>2));
   86     return (hash & ( (1<< (MAX_CODE_BITS+1)) - 1));//
   87 }
   88 inline void putcc(unsigned int value) {
   89     assert ( value >= 0 && value < MAX_CODE_VALUE);
   90     if ( value < 256 ) {
   91         if ( value == '\n' ) {
   92             printf("\\n");
   93         }
   94         else
   95             printf("%c",value);
   96     }
   97     else
   98         printf("%u",value);
   99 }
  100 void putin_table(const unsigned int prefix,const unsigned char suffix,
  101         const int hash_index,unsigned int *code,
  102         int *curr_bits ) {
  103     if ( *code < MAX_CODE_VALUE ) {
  104         hash_table[ hash_index ] = (*code)++;
  105         prefix_table[hash_index] = prefix;
  106         suffix_table[hash_index] = suffix;
  107     }
  108 
  109     if ( prefix >= CURR_MAX_CODE_VALUE(*curr_bits) && *curr_bits < MAX_CODE_BITS ) {
  110         __saveBits(CURR_MAX_CODE_VALUE(*curr_bits),*curr_bits);
  111         (*curr_bits)++;
  112     }
  113 }
  114 
  115 
  116 void compress(const char *out_file_name) {
  117 
  118     //init
  119     FILE *file = fopen(out_file_name,"wb");
  120     if ( file == NULL ) {
  121         puts("open file fail\n");
  122         return ;
  123     }
  124     hash_table = (unsigned int * ) malloc ( sizeof(unsigned int) * HASH_TABLE_SIZE);
  125     prefix_table = (unsigned int *) malloc ( sizeof(unsigned int) * HASH_TABLE_SIZE);
  126     suffix_table = (unsigned char *) malloc ( sizeof(unsigned char) * HASH_TABLE_SIZE);
  127     memset(hash_table,-1,sizeof(unsigned int) * HASH_TABLE_SIZE);
  128 
  129     m_file_buffer_cnt = 0;
  130     m_comp_buffer_cnt = 0;
  131     m_encode_bit_buffer = 0u;
  132     m_encode_bit_buffer_cnt = 0;
  133 
  134     __saveBits(MIN_CODE_BITS,8);
  135     assert ( m_file_size < ( 1 << 20) );
  136     __saveBits(m_file_size,20);
  137 
  138     unsigned int STRING = m_file_buffer[ m_file_buffer_cnt++  ];
  139     printf("(%c,",STRING);
  140     unsigned int code = FIRST_CODE;
  141     int curr_bits =  MIN_CODE_BITS;
  142 
  143     while ( m_file_buffer_cnt < m_file_size ) {
  144         if (m_comp_buffer_cnt >= COMP_BUFFER_SIZE) {
  145             //write to disk
  146             fwrite(m_comp_buffer,1,m_comp_buffer_cnt,file);
  147             printf("%d\n",m_comp_buffer_cnt);
  148             m_comp_buffer_cnt = 0;
  149         }
  150         
  151         unsigned char CHARACTER = m_file_buffer[m_file_buffer_cnt++];
  152         putcc(CHARACTER);printf(")");
  153 
  154         int hash_index = hash_func(STRING,CHARACTER);
  155         int offset;
  156         if ( hash_index == 0 )
  157             offset = 1;
  158         else
  159             offset = HASH_TABLE_SIZE - hash_index;
  160 
  161         while ( 1 ) {
  162             if( hash_table[ hash_index ] == -1 ) {
  163                 //case:not in the table
  164                 printf("\tNO");
  165                 //add STRING + CHARACTER to the table
  166                 printf("\t%d\t",code);
  167                 putin_table(STRING,CHARACTER,hash_index,&code,&curr_bits);
  168                 //output the code for STRING
  169                 __saveBits(STRING,curr_bits);//如果冗余度不高,很可能会增加编码量
  170                 putcc(STRING);printf(" in %d bits\n",curr_bits);
  171 
  172                 STRING = CHARACTER;
  173                 printf("(");putcc(CHARACTER);printf(",");
  174                 break;
  175             }
  176             else if( prefix_table[hash_index] == STRING
  177                     && suffix_table[hash_index] == CHARACTER) {
  178                 STRING = hash_table[hash_index];    
  179                 printf("\tYES\n(%u,",STRING);
  180                 break;
  181             }
  182             else{
  183             }
  184 
  185             hash_index -= offset;
  186             if (hash_index < 0 )
  187                 hash_index += HASH_TABLE_SIZE;
  188         }
  189     }
  190     __saveBits(STRING,curr_bits);
  191     printf(")\tNO\t");putcc(STRING);printf("in %d bits\n",curr_bits);
  192 
  193     //clear bit buffer
  194     __saveBits(-1,8);
  195 
  196     fwrite(m_comp_buffer,1,m_comp_buffer_cnt,file);
  197 
  198     fclose(file); file = NULL;
  199     free(hash_table); hash_table = NULL;
  200     free(prefix_table); prefix_table = NULL;
  201     free(suffix_table); suffix_table = NULL;
  202 }
  203 
  204 unsigned char *decode_string(unsigned char *decode_stack,unsigned int code) {
  205     while ( prefix_table[code] != -1) {
  206         *decode_stack++ = suffix_table[code];
  207         code = prefix_table[code];
  208     }
  209     *decode_stack = code;
  210     return decode_stack;
  211 }
  212 void decompress(const char *out_file_name) {
  213 
  214     puts("\n[decompress]\n");
  215     FILE *file = fopen(out_file_name,"wb");
  216     if ( file == NULL ) {
  217         puts("open file error!\n");
  218     }
  219     
  220     //init
  221     prefix_table = (unsigned int *) malloc ( sizeof(unsigned int) * HASH_TABLE_SIZE);
  222     suffix_table = (unsigned char *) malloc ( sizeof(unsigned char) * HASH_TABLE_SIZE);
  223     memset(prefix_table,-1,sizeof(unsigned int) * HASH_TABLE_SIZE);
  224 
  225     m_file_buffer_cnt = 0;
  226     m_comp_buffer_cnt = 0;
  227     m_decode_bit_buffer = 0u;
  228     m_decode_bit_buffer_cnt = 0;
  229 
  230 
  231     int curr_bits = __readBits(8);
  232     if ( curr_bits != MIN_CODE_BITS ) {
  233         puts ("decode error!\n");
  234         return;
  235     }
  236     int origin_file_size = __readBits(20);
  237     m_comp_buffer = (unsigned char *) malloc ( sizeof(unsigned char) * origin_file_size);
  238 
  239     unsigned int code = (1 << (curr_bits - 1));
  240     assert ( code == FIRST_CODE );
  241 
  242     unsigned int CHARACTER;
  243     unsigned int OLD_CODE = __readBits(curr_bits);
  244     m_comp_buffer[m_comp_buffer_cnt++] = OLD_CODE; printf("%c",OLD_CODE);
  245 
  246     while ( 1) {
  247         unsigned int NEW_CODE = __readBits(curr_bits);
  248         if ( NEW_CODE == CURR_MAX_CODE_VALUE(curr_bits)) {
  249             curr_bits++;
  250             continue;
  251         }
  252         else if ( NEW_CODE == (unsigned) -1) {
  253             break;
  254         }
  255 
  256         if ( NEW_CODE < 256 ) {
  257             m_comp_buffer[m_comp_buffer_cnt++]= NEW_CODE; printf("%c",NEW_CODE);
  258             CHARACTER = NEW_CODE;
  259         }
  260         else if ( prefix_table[NEW_CODE] != -1) {
  261             *decode_stack = suffix_table[NEW_CODE];
  262             unsigned char *t =  decode_string(decode_stack+1,prefix_table[NEW_CODE]);
  263             CHARACTER = *t;
  264             while( t >= decode_stack ) {
  265                 m_comp_buffer[m_comp_buffer_cnt++]= *t; printf("%c",*t--);
  266             }
  267         }
  268         else if ( NEW_CODE == code ) {
  269             if ( prefix_table[OLD_CODE] == -1){
  270                 CHARACTER = OLD_CODE;
  271                 assert ( OLD_CODE < 256 );
  272             }
  273             else {
  274                 assert ( prefix_table[OLD_CODE] != - 1);
  275                 unsigned char *t = decode_string(decode_stack,prefix_table[OLD_CODE]);
  276                 CHARACTER = *t;//
  277             }
  278         }
  279         else {
  280             assert ( 1 == 0 );
  281         }
  282 
  283         //OLD_CODE + CHARACTER
  284         prefix_table[code] = OLD_CODE;
  285         suffix_table[code] = CHARACTER;
  286 
  287         if ( code == NEW_CODE ) {
  288             if( prefix_table[OLD_CODE] == -1){
  289                 m_comp_buffer[m_comp_buffer_cnt++]= OLD_CODE; printf("%c",OLD_CODE);
  290             }
  291             else {
  292                 *decode_stack = suffix_table[OLD_CODE];
  293                 unsigned char *t =  decode_string(decode_stack+1,prefix_table[OLD_CODE]);
  294                 while( t >= decode_stack ) {
  295                     m_comp_buffer[m_comp_buffer_cnt++]= *t; printf("%c",*t--);
  296                 }
  297             }
  298             m_comp_buffer[m_comp_buffer_cnt++]= CHARACTER; printf("%c",CHARACTER);
  299         }
  300         code++;
  301 
  302         OLD_CODE = NEW_CODE;
  303     }
  304 
  305     fwrite(m_comp_buffer,1,m_comp_buffer_cnt,file);
  306     assert( m_comp_buffer_cnt == origin_file_size);
  307 
  308     //free
  309     fclose(file); file = NULL;
  310     free(prefix_table); prefix_table = NULL;
  311     free(suffix_table); suffix_table = NULL;
  312     free(m_comp_buffer); m_comp_buffer = NULL;
  313         
  314 }
  315 
  316 int main(int argc,char *argv[]) {
  317 
  318     if ( argc != 2 ) {
  319         puts("usage: lzw [file path]\n");
  320         return 0;
  321     }
  322 
  323     //init
  324     m_file_buffer = NULL;
  325     m_comp_buffer = NULL;
  326     prefix_table = NULL;
  327     suffix_table = NULL;
  328     hash_table   = NULL;
  329     
  330     FILE *file = fopen(argv[1],"rb"); //open file
  331     if ( file == NULL ) {
  332         puts ("open file fail!\n");
  333     }
  334 
  335     m_file_size = get_file_size(file);
  336     if ( m_file_size == 0 ) {
  337         puts("empty file");
  338         return 0;
  339     }
  340     m_file_buffer = (unsigned char *) malloc ( sizeof(unsigned char) * m_file_size);
  341     m_comp_buffer = (unsigned char *) malloc ( sizeof(unsigned char) * (COMP_BUFFER_SIZE+3));
  342     fread ( m_file_buffer, 1, m_file_size, file); //read to buffer
  343 
  344     strcat(argv[1],".lzw");
  345     compress(argv[1]);
  346     
  347     //free
  348     free(m_file_buffer); m_file_buffer = NULL; fclose(file); file = NULL;
  349     free(m_comp_buffer); m_comp_buffer = NULL;
  350     
  351     //decompress
  352     file = fopen(argv[1],"rb");
  353     if ( file == NULL ) {
  354         puts("open file fail!\n");
  355         return 0;
  356     }
  357 
  358     m_file_size = get_file_size(file);
  359     m_file_buffer = (unsigned char *) malloc ( sizeof(unsigned char) * m_file_size );
  360     fread ( m_file_buffer, 1, m_file_size, file); //read to buffer
  361 
  362     strcpy(argv[1]+ strlen( argv[1] ) - 3,"copy");
  363     decompress(argv[1]);
  364     
  365     //free
  366     free(m_file_buffer); m_file_buffer = NULL;
  367     free(m_comp_buffer); m_comp_buffer = NULL;
  368 
  369     assert ( m_file_buffer == NULL);
  370     assert ( m_comp_buffer == NULL);
  371     assert ( hash_table == NULL);
  372     assert ( prefix_table == NULL);
  373     assert ( suffix_table == NULL);
  374 
  375     return 0;
  376 }
  377