这几天把以前的资料整理了下,感觉之前在网上找的lzw代码太臃肿,就用C重写了一个。在之前的博文已经把基本的算法步骤列出来了,压缩过程比较简单,但解压过程有点繁琐,需要注意的是解压过程中,当读到的编码值和当前编码值相等的时候该如何处理,另外就是如何处理变长编码。代码中有两个hash函数,一个是直接将前后缀的二进制拼接在一起,一个是JSHash。程序在压缩的时候会输出一些信息,方便理解压缩的过程,解压结果也会输出在屏幕上。压缩结果保存在以lzw为后缀的文件里,解压结果保存在以copy为后缀的文件里。理论上任意文件都能正确地压缩和解压缩,欢迎试用和反馈。
PS:写压缩程序就像制作一块机械表,关注每一个bit每一个byte的意义,需要高度地专注才能完成这么精细的工作,写完很有成就感:),不过写出bug也挺惨的:(
图1展示了对一个文本文件的压缩过程,cat txt显示了待压缩文本的内容。现在来看下压缩过程,前两个A相组合,字典中找不到,将前缀A以9个bit位输出到缓冲区,后缀A作为前缀,继续读取字符,并将其作为后缀。读到的仍然是A,这时组合(A,A)已经存在于字典中,将组合(A,A)的编码值256作为前缀,继续读下一个字符,以此类推,直到当前组合不存在于字典中,即(256,A)不在字典中,将256以9个bit位输出到缓冲区。按照这种方式一直处理到文件尾。解压的步骤见之前的博客和源代码。
图1 压缩过程和解压结果
源代码,当中包括了调试用的断言和输出。
1 #include <time.h> 2 #include <stdlib.h> 3 #include <stdio.h> 4 #include <assert.h> 5 #include <string.h> 6 7 unsigned char *m_file_buffer; 8 unsigned char *m_comp_buffer; 9 unsigned int *prefix_table; 10 unsigned char *suffix_table; 11 unsigned int *hash_table; 12 unsigned int m_file_buffer_cnt; 13 unsigned int m_comp_buffer_cnt; 14 unsigned char decode_stack[4000]; 15 16 //bit buffer 17 unsigned int m_encode_bit_buffer; 18 unsigned int m_decode_bit_buffer; 19 int m_encode_bit_buffer_cnt; 20 int m_decode_bit_buffer_cnt; 21 22 size_t m_file_size; 23 24 #define COMP_BUFFER_SIZE 1024 25 #define MIN_CODE_BITS 9 26 #define MAX_CODE_BITS 20 27 #define MAX_CODE_VALUE ((1 << MAX_CODE_BITS) -1) 28 #define CURR_MAX_CODE_VALUE(x) ( (1u << x) -1) 29 #define FIRST_CODE ( 1<< (MIN_CODE_BITS - 1)) 30 #define hash_shift (MAX_CODE_BITS - 8) 31 #define HASH_TABLE_SIZE 1048583 //HASH_TABLE_SIZE应该是大于MAX_CODE_VALUE的第一个素数 32 33 size_t get_file_size(FILE *file) { 34 fseek(file,0,SEEK_END); 35 size_t file_size = ftell(file); 36 fseek(file,0,SEEK_SET); 37 return file_size; 38 } 39 40 void __saveBits(int nCode,int nBitCnt) 41 { 42 assert(nBitCnt <= 25); 43 if(-1 == nCode) { 44 if(m_encode_bit_buffer_cnt) { 45 m_encode_bit_buffer >>= 24; 46 m_comp_buffer[ m_comp_buffer_cnt++ ] = (unsigned char)m_encode_bit_buffer; 47 m_encode_bit_buffer_cnt = 0; 48 m_encode_bit_buffer = 0u; 49 } 50 return; 51 } 52 m_encode_bit_buffer |= ((unsigned int) nCode) << (32 - nBitCnt - m_encode_bit_buffer_cnt); 53 m_encode_bit_buffer_cnt += nBitCnt; 54 while (m_encode_bit_buffer_cnt >= 8) { 55 m_comp_buffer[ m_comp_buffer_cnt++ ] = (unsigned char )(m_encode_bit_buffer >> 24 ); 56 m_encode_bit_buffer <<= 8; 57 m_encode_bit_buffer_cnt -= 8; 58 } 59 } 60 unsigned int __readBits(int nBitCnt) 61 { 62 assert(nBitCnt <= 25 && nBitCnt >= 1); 63 int c; 64 unsigned int return_value; 65 while (m_decode_bit_buffer_cnt <= 24 && m_file_buffer_cnt < m_file_size) { 66 c = m_file_buffer [ m_file_buffer_cnt++ ]; 67 m_decode_bit_buffer |= (unsigned int) c << (24-m_decode_bit_buffer_cnt); 68 m_decode_bit_buffer_cnt += 8; 69 } 70 if(m_decode_bit_buffer_cnt < (unsigned)nBitCnt) { 71 return -1; /* EOF */ 72 } 73 return_value=m_decode_bit_buffer >> (32-nBitCnt); 74 m_decode_bit_buffer <<= nBitCnt; 75 m_decode_bit_buffer_cnt -= nBitCnt; 76 return(return_value); 77 } 78 79 inline unsigned int hash_func(const unsigned int prefix,const unsigned char suffix) { 80 return (suffix << hash_shift) ^ prefix; 81 } 82 inline unsigned int JSHash(const unsigned int prefix,const unsigned char suffix){ 83 unsigned int hash = 1315423911; 84 hash ^= ((hash<<5) + prefix + (hash>>2)); 85 hash ^= ((hash<<5) + suffix + (hash>>2)); 86 return (hash & ( (1<< (MAX_CODE_BITS+1)) - 1));// 87 } 88 inline void putcc(unsigned int value) { 89 assert ( value >= 0 && value < MAX_CODE_VALUE); 90 if ( value < 256 ) { 91 if ( value == '\n' ) { 92 printf("\\n"); 93 } 94 else 95 printf("%c",value); 96 } 97 else 98 printf("%u",value); 99 } 100 void putin_table(const unsigned int prefix,const unsigned char suffix, 101 const int hash_index,unsigned int *code, 102 int *curr_bits ) { 103 if ( *code < MAX_CODE_VALUE ) { 104 hash_table[ hash_index ] = (*code)++; 105 prefix_table[hash_index] = prefix; 106 suffix_table[hash_index] = suffix; 107 } 108 109 if ( prefix >= CURR_MAX_CODE_VALUE(*curr_bits) && *curr_bits < MAX_CODE_BITS ) { 110 __saveBits(CURR_MAX_CODE_VALUE(*curr_bits),*curr_bits); 111 (*curr_bits)++; 112 } 113 } 114 115 116 void compress(const char *out_file_name) { 117 118 //init 119 FILE *file = fopen(out_file_name,"wb"); 120 if ( file == NULL ) { 121 puts("open file fail\n"); 122 return ; 123 } 124 hash_table = (unsigned int * ) malloc ( sizeof(unsigned int) * HASH_TABLE_SIZE); 125 prefix_table = (unsigned int *) malloc ( sizeof(unsigned int) * HASH_TABLE_SIZE); 126 suffix_table = (unsigned char *) malloc ( sizeof(unsigned char) * HASH_TABLE_SIZE); 127 memset(hash_table,-1,sizeof(unsigned int) * HASH_TABLE_SIZE); 128 129 m_file_buffer_cnt = 0; 130 m_comp_buffer_cnt = 0; 131 m_encode_bit_buffer = 0u; 132 m_encode_bit_buffer_cnt = 0; 133 134 __saveBits(MIN_CODE_BITS,8); 135 assert ( m_file_size < ( 1 << 20) ); 136 __saveBits(m_file_size,20); 137 138 unsigned int STRING = m_file_buffer[ m_file_buffer_cnt++ ]; 139 printf("(%c,",STRING); 140 unsigned int code = FIRST_CODE; 141 int curr_bits = MIN_CODE_BITS; 142 143 while ( m_file_buffer_cnt < m_file_size ) { 144 if (m_comp_buffer_cnt >= COMP_BUFFER_SIZE) { 145 //write to disk 146 fwrite(m_comp_buffer,1,m_comp_buffer_cnt,file); 147 printf("%d\n",m_comp_buffer_cnt); 148 m_comp_buffer_cnt = 0; 149 } 150 151 unsigned char CHARACTER = m_file_buffer[m_file_buffer_cnt++]; 152 putcc(CHARACTER);printf(")"); 153 154 int hash_index = hash_func(STRING,CHARACTER); 155 int offset; 156 if ( hash_index == 0 ) 157 offset = 1; 158 else 159 offset = HASH_TABLE_SIZE - hash_index; 160 161 while ( 1 ) { 162 if( hash_table[ hash_index ] == -1 ) { 163 //case:not in the table 164 printf("\tNO"); 165 //add STRING + CHARACTER to the table 166 printf("\t%d\t",code); 167 putin_table(STRING,CHARACTER,hash_index,&code,&curr_bits); 168 //output the code for STRING 169 __saveBits(STRING,curr_bits);//如果冗余度不高,很可能会增加编码量 170 putcc(STRING);printf(" in %d bits\n",curr_bits); 171 172 STRING = CHARACTER; 173 printf("(");putcc(CHARACTER);printf(","); 174 break; 175 } 176 else if( prefix_table[hash_index] == STRING 177 && suffix_table[hash_index] == CHARACTER) { 178 STRING = hash_table[hash_index]; 179 printf("\tYES\n(%u,",STRING); 180 break; 181 } 182 else{ 183 } 184 185 hash_index -= offset; 186 if (hash_index < 0 ) 187 hash_index += HASH_TABLE_SIZE; 188 } 189 } 190 __saveBits(STRING,curr_bits); 191 printf(")\tNO\t");putcc(STRING);printf("in %d bits\n",curr_bits); 192 193 //clear bit buffer 194 __saveBits(-1,8); 195 196 fwrite(m_comp_buffer,1,m_comp_buffer_cnt,file); 197 198 fclose(file); file = NULL; 199 free(hash_table); hash_table = NULL; 200 free(prefix_table); prefix_table = NULL; 201 free(suffix_table); suffix_table = NULL; 202 } 203 204 unsigned char *decode_string(unsigned char *decode_stack,unsigned int code) { 205 while ( prefix_table[code] != -1) { 206 *decode_stack++ = suffix_table[code]; 207 code = prefix_table[code]; 208 } 209 *decode_stack = code; 210 return decode_stack; 211 } 212 void decompress(const char *out_file_name) { 213 214 puts("\n[decompress]\n"); 215 FILE *file = fopen(out_file_name,"wb"); 216 if ( file == NULL ) { 217 puts("open file error!\n"); 218 } 219 220 //init 221 prefix_table = (unsigned int *) malloc ( sizeof(unsigned int) * HASH_TABLE_SIZE); 222 suffix_table = (unsigned char *) malloc ( sizeof(unsigned char) * HASH_TABLE_SIZE); 223 memset(prefix_table,-1,sizeof(unsigned int) * HASH_TABLE_SIZE); 224 225 m_file_buffer_cnt = 0; 226 m_comp_buffer_cnt = 0; 227 m_decode_bit_buffer = 0u; 228 m_decode_bit_buffer_cnt = 0; 229 230 231 int curr_bits = __readBits(8); 232 if ( curr_bits != MIN_CODE_BITS ) { 233 puts ("decode error!\n"); 234 return; 235 } 236 int origin_file_size = __readBits(20); 237 m_comp_buffer = (unsigned char *) malloc ( sizeof(unsigned char) * origin_file_size); 238 239 unsigned int code = (1 << (curr_bits - 1)); 240 assert ( code == FIRST_CODE ); 241 242 unsigned int CHARACTER; 243 unsigned int OLD_CODE = __readBits(curr_bits); 244 m_comp_buffer[m_comp_buffer_cnt++] = OLD_CODE; printf("%c",OLD_CODE); 245 246 while ( 1) { 247 unsigned int NEW_CODE = __readBits(curr_bits); 248 if ( NEW_CODE == CURR_MAX_CODE_VALUE(curr_bits)) { 249 curr_bits++; 250 continue; 251 } 252 else if ( NEW_CODE == (unsigned) -1) { 253 break; 254 } 255 256 if ( NEW_CODE < 256 ) { 257 m_comp_buffer[m_comp_buffer_cnt++]= NEW_CODE; printf("%c",NEW_CODE); 258 CHARACTER = NEW_CODE; 259 } 260 else if ( prefix_table[NEW_CODE] != -1) { 261 *decode_stack = suffix_table[NEW_CODE]; 262 unsigned char *t = decode_string(decode_stack+1,prefix_table[NEW_CODE]); 263 CHARACTER = *t; 264 while( t >= decode_stack ) { 265 m_comp_buffer[m_comp_buffer_cnt++]= *t; printf("%c",*t--); 266 } 267 } 268 else if ( NEW_CODE == code ) { 269 if ( prefix_table[OLD_CODE] == -1){ 270 CHARACTER = OLD_CODE; 271 assert ( OLD_CODE < 256 ); 272 } 273 else { 274 assert ( prefix_table[OLD_CODE] != - 1); 275 unsigned char *t = decode_string(decode_stack,prefix_table[OLD_CODE]); 276 CHARACTER = *t;// 277 } 278 } 279 else { 280 assert ( 1 == 0 ); 281 } 282 283 //OLD_CODE + CHARACTER 284 prefix_table[code] = OLD_CODE; 285 suffix_table[code] = CHARACTER; 286 287 if ( code == NEW_CODE ) { 288 if( prefix_table[OLD_CODE] == -1){ 289 m_comp_buffer[m_comp_buffer_cnt++]= OLD_CODE; printf("%c",OLD_CODE); 290 } 291 else { 292 *decode_stack = suffix_table[OLD_CODE]; 293 unsigned char *t = decode_string(decode_stack+1,prefix_table[OLD_CODE]); 294 while( t >= decode_stack ) { 295 m_comp_buffer[m_comp_buffer_cnt++]= *t; printf("%c",*t--); 296 } 297 } 298 m_comp_buffer[m_comp_buffer_cnt++]= CHARACTER; printf("%c",CHARACTER); 299 } 300 code++; 301 302 OLD_CODE = NEW_CODE; 303 } 304 305 fwrite(m_comp_buffer,1,m_comp_buffer_cnt,file); 306 assert( m_comp_buffer_cnt == origin_file_size); 307 308 //free 309 fclose(file); file = NULL; 310 free(prefix_table); prefix_table = NULL; 311 free(suffix_table); suffix_table = NULL; 312 free(m_comp_buffer); m_comp_buffer = NULL; 313 314 } 315 316 int main(int argc,char *argv[]) { 317 318 if ( argc != 2 ) { 319 puts("usage: lzw [file path]\n"); 320 return 0; 321 } 322 323 //init 324 m_file_buffer = NULL; 325 m_comp_buffer = NULL; 326 prefix_table = NULL; 327 suffix_table = NULL; 328 hash_table = NULL; 329 330 FILE *file = fopen(argv[1],"rb"); //open file 331 if ( file == NULL ) { 332 puts ("open file fail!\n"); 333 } 334 335 m_file_size = get_file_size(file); 336 if ( m_file_size == 0 ) { 337 puts("empty file"); 338 return 0; 339 } 340 m_file_buffer = (unsigned char *) malloc ( sizeof(unsigned char) * m_file_size); 341 m_comp_buffer = (unsigned char *) malloc ( sizeof(unsigned char) * (COMP_BUFFER_SIZE+3)); 342 fread ( m_file_buffer, 1, m_file_size, file); //read to buffer 343 344 strcat(argv[1],".lzw"); 345 compress(argv[1]); 346 347 //free 348 free(m_file_buffer); m_file_buffer = NULL; fclose(file); file = NULL; 349 free(m_comp_buffer); m_comp_buffer = NULL; 350 351 //decompress 352 file = fopen(argv[1],"rb"); 353 if ( file == NULL ) { 354 puts("open file fail!\n"); 355 return 0; 356 } 357 358 m_file_size = get_file_size(file); 359 m_file_buffer = (unsigned char *) malloc ( sizeof(unsigned char) * m_file_size ); 360 fread ( m_file_buffer, 1, m_file_size, file); //read to buffer 361 362 strcpy(argv[1]+ strlen( argv[1] ) - 3,"copy"); 363 decompress(argv[1]); 364 365 //free 366 free(m_file_buffer); m_file_buffer = NULL; 367 free(m_comp_buffer); m_comp_buffer = NULL; 368 369 assert ( m_file_buffer == NULL); 370 assert ( m_comp_buffer == NULL); 371 assert ( hash_table == NULL); 372 assert ( prefix_table == NULL); 373 assert ( suffix_table == NULL); 374 375 return 0; 376 } 377