mnoabczxyuvwabc123456abczxydefgh
替换为:mnoabczxyuvw(9,3,1)23456(18,6,d)efgh
。mnoabczxyuvw(9,3)123456(18,6)defgh
。压缩格式分两个文件保存:
Huffman编码保证了二叉树的带权路径长度最小,提高了压缩率
压缩文件中除了要保存压缩数据,还必须保存解压缩需要用到的信息:
文件类型 | 源文件大小 | 压缩大小 | 压缩率 |
---|---|---|---|
视频/图片文件 | 10.3MB | 10.3MB | 100% |
文本文件 | 12.0MB | 8.51MB | 70.9% |
详细代码请前往GitHub—>点我啊
#pragma once
#include
#include
#include"Huffman.hpp"
// Huffman-coding file compressor / decompressor.
// The class declares no data members; every call works on files only.
class FileCompressHuffman {
public:
FileCompressHuffman();
// Compresses the file `fileName`; the definition writes the result to "Huffman.bin".
void CompressFile(const std::string& fileName);
// Decompresses the file `fileName`; the definition writes the result to "h2.bin".
void UnCompressFile(const std::string& fileName);
private:
// Reads the frequency header from `pRead` into `count` (indexed by byte value).
void ReadHead(FILE* pRead, std::vector<int>& count);
// Writes the frequency header for `count` to `pWrite`.
void WriteHead(FILE* pWrite, const std::vector<int>& count);
// Fills `charPassWord[byte]` with the '0'/'1' code string of every leaf below `ptr`.
void GetHuffmanCode(HuffManTreeNode* ptr, std::vector<std::string>& charPassWord);
};
#pragma once
#include"Common.hpp"
// Hash table used by the LZ77 matcher to chain together window positions
// whose leading bytes hash to the same address.
//
// prev_ and head_ share one allocation (see the constructor): head_[hash]
// holds the most recently inserted position for a hash address, and
// prev_[pos & mask] holds the position head_ held before `pos` was inserted.
// Position 0 doubles as the "no entry" marker.
class HashTable {
public:
    // Allocates and zero-fills both tables, `size` entries each.
    HashTable(USH size);
    ~HashTable();

    // The table owns a raw array that the destructor releases; an implicit
    // copy would free it twice, so copying is disabled.
    HashTable(const HashTable&) = delete;
    HashTable& operator=(const HashTable&) = delete;

    // Folds `ch` into `hashAddr`, records `pos` as the newest position for the
    // resulting address and returns the previous newest one in `matchhead`.
    void Insert(USH& matchhead, UCH ch, USH pos, USH& hashAddr);
    // Rolls the running hash `hashAddr` forward by the byte `ch`.
    void hashFunc(USH& hashAddr, UCH ch);
    // Returns the position chained before `matchHead`.
    USH GetNext(USH matchHead);
    // Rebases every stored position by WSIZE after the window has been
    // shifted; positions not greater than WSIZE are reset to 0.
    void Update();
private:
    // Number of bits the hash is shifted by per input byte.
    USH H_SHIFT();
private:
    USH *prev_;
    USH *head_;
};
#pragma once
#include
#include
#include
// Node of the Huffman tree. Leaves carry a byte value in Ch_; internal nodes
// only carry the combined weight of their subtree.
struct HuffManTreeNode {
    HuffManTreeNode(unsigned long long weight, unsigned char ch = 0)
        : pLeft_(nullptr)
        , pRight_(nullptr)
        , Weight_(weight)
        , Ch_(ch)
    {}
    HuffManTreeNode *pLeft_;
    HuffManTreeNode *pRight_;
    unsigned long long Weight_; // weight (occurrence count)
    unsigned char Ch_;          // byte value represented by a leaf
};

// Comparator that turns std::priority_queue into a min-heap on Weight_.
class Less {
public:
    bool operator()(const HuffManTreeNode* pLeft, const HuffManTreeNode* pRight) const {
        return pLeft->Weight_ > pRight->Weight_;
    }
};

// Huffman tree built from a table of per-byte occurrence counts.
// The tree owns all of its nodes and frees them in the destructor.
class HuffmanTree {
    typedef HuffManTreeNode Node;
    typedef HuffManTreeNode* PNode;
public:
    // `arr[b]` is the number of occurrences of byte value `b`; entries that
    // are not positive are ignored, as are entries beyond index 255.
    HuffmanTree(const std::vector<int>& arr)
        : pRoot_(nullptr)
    {
        CreateHuffmanTree(arr);
    }

    // Nodes are owned through raw pointers, so a copy would delete them twice.
    HuffmanTree(const HuffmanTree&) = delete;
    HuffmanTree& operator=(const HuffmanTree&) = delete;

    ~HuffmanTree() {
        // Iterative in-order walk: a node is deleted only after its whole
        // left subtree is gone and its right child has been remembered.
        std::stack<PNode> pending;
        PNode cur = pRoot_;
        while (!pending.empty() || cur) {
            while (cur) {
                pending.push(cur);
                cur = cur->pLeft_;
            }
            PNode del = pending.top();
            pending.pop();
            cur = del->pRight_;
            delete del;
        }
    }

    // Returns the root, or nullptr when no byte had a positive count.
    PNode GetRoot() {
        return pRoot_;
    }
private:
    void CreateHuffmanTree(const std::vector<int>& count) {
        // Min-heap of subtrees ordered by weight.
        std::priority_queue<PNode, std::vector<PNode>, Less> que;
        const size_t symbols = count.size() < 256 ? count.size() : 256;
        for (size_t i = 0; i < symbols; ++i) {
            if (count[i] > 0) { // only bytes that actually occur become leaves
                que.push(new Node(count[i], static_cast<unsigned char>(i)));
            }
        }
        // Nothing to encode: leave pRoot_ null instead of calling top() on
        // an empty queue.
        if (que.empty()) {
            return;
        }
        // Repeatedly merge the two lightest subtrees.
        while (que.size() > 1) {
            PNode left = que.top();
            que.pop();
            PNode right = que.top();
            que.pop();
            PNode parent = new Node(left->Weight_ + right->Weight_);
            parent->pLeft_ = left;
            parent->pRight_ = right;
            que.push(parent);
        }
        pRoot_ = que.top();
    }
private:
    Node* pRoot_;
};
#pragma once
#include
#include"HashTable.hpp"
// LZ77 file compressor / decompressor working on a sliding window of
// 2 * WSIZE bytes (pWin_) with a hash table (ht_) to locate earlier matches.
class LZ77 {
public:
    LZ77();
    ~LZ77();

    // The window buffer is a raw array released in the destructor; an
    // implicit copy would free it twice, so copying is disabled.
    LZ77(const LZ77&) = delete;
    LZ77& operator=(const LZ77&) = delete;

    // Compresses `fileName`; the definition writes the result to "LZ77.bin".
    void CompressionFile(const std::string& fileName);
    // Restores the data stored in the compressed file `fileName`.
    void UnCompressionFile(const std::string& fileName);
private:
    // Appends the flag file, its size and `fileSize` to the stream `fW`.
    void MergeFile(FILE* fW, ULL fileSize);
    // Slides the window and reads more input once `start` is in the right half.
    void fillWindow(USH& start, FILE* fR, size_t& readSize);
    // Walks the hash chain from `matchHead`; returns the best match length and
    // stores its distance from `start` in `curMatchDist`.
    USH LongestMatch(USH matchHead, USH &curMatchDist, USH start);
    // Appends one flag bit (1 = length/distance pair, 0 = literal) to `file`.
    void WriteFlag(FILE* file, UCH& chNum, UCH& bitCount, bool isLen);
private:
    UCH* pWin_;
    HashTable ht_;
    std::string fileName_;
};
#pragma once
// Short aliases for the unsigned integer types used throughout the LZ77 code.
using UCH = unsigned char;
using USH = unsigned short;
using ULL = unsigned long long;

// Shortest match that is worth encoding as a length/distance pair.
const USH MIN_MATCH = 3;
// Longest match that can be encoded.
const USH MAX_MATCH = 258;
// Size of one half of the sliding window: 32K.
const USH WSIZE = 32 * 1024;
#define _CRT_SECURE_NO_WARNINGS 1
#include"FileCompressHuffman.hpp"
#include
// Default constructor: the body is empty, there is nothing to initialise.
FileCompressHuffman::FileCompressHuffman(){}
// Compresses `fileName` with Huffman coding and writes the result to
// "Huffman.bin".
//
// Output layout: the text header produced by WriteHead (byte frequencies),
// followed by the code bits packed most-significant-bit first; the final
// byte is padded with zero bits. An empty input produces a header only.
// Failures to open either file are reported on std::cout and abort the call.
void FileCompressHuffman::CompressFile(const std::string& fileName) {
    // Binary mode: the input is not necessarily a text file.
    FILE *pFile = fopen(fileName.c_str(), "rb");
    if (!pFile) {
        std::cout << "open file " << fileName << " error!" << std::endl;
        return;
    }

    // 1. Count how often each byte value occurs. The buffer is unsigned so a
    //    byte can be used directly as an index into `count`.
    unsigned char buf[1024] = { 0 };
    std::vector<int> count(256, 0);
    bool hasData = false;
    size_t rdSize = 0;
    while ((rdSize = fread(buf, 1, sizeof(buf), pFile)) > 0) {
        hasData = true;
        for (size_t i = 0; i < rdSize; ++i) {
            ++count[buf[i]];
        }
    }

    // 2 + 3. Build the Huffman tree from the counts and derive every byte's
    //        code. Skipped for an empty file, which has no symbols to build
    //        a tree from.
    std::vector<std::string> strCode(256);
    if (hasData) {
        HuffmanTree tree(count);
        GetHuffmanCode(tree.GetRoot(), strCode);
    }

    // 4. Re-read the source and emit the code of every byte.
    FILE *pWrite = fopen("Huffman.bin", "wb");
    if (!pWrite) {
        std::cout << "open Huffman.bin error!" << std::endl;
        fclose(pFile); // do not leak the input handle on this path
        return;
    }
    WriteHead(pWrite, count);
    fseek(pFile, 0, SEEK_SET);

    unsigned char ch = 0; // bits collected for the next output byte
    int bitCount = 0;     // number of valid bits in `ch`
    while ((rdSize = fread(buf, 1, sizeof(buf), pFile)) > 0) {
        for (size_t i = 0; i < rdSize; ++i) {
            // Reference, not copy: this runs once per input byte.
            const std::string& passWord = strCode[buf[i]];
            for (size_t j = 0; j < passWord.size(); ++j) {
                ch <<= 1;
                ++bitCount;
                if (passWord[j] == '1') {
                    ch |= 1;
                }
                if (bitCount == 8) {
                    fputc(ch, pWrite);
                    bitCount = 0;
                    ch = 0;
                }
            }
        }
    }
    // Flush a partially filled last byte, left-aligned.
    if (bitCount > 0) {
        ch = ch << (8 - bitCount);
        fputc(ch, pWrite);
    }
    fclose(pFile);
    fclose(pWrite);
}
void FileCompressHuffman::UnCompressFile(const std::string& fileName) {
FILE *pRead = fopen(fileName.c_str(), "rb");
if (!pRead) {
std::cout << "open file " << fileName << " error!" << std::endl;
return;
}
//读取头部信息
std::vector<int> count;
count.resize(256);
ReadHead(pRead, count);
//创建Haffman树
HuffmanTree tree(count);
FILE *pWrite = fopen("h2.bin", "wb");
if (pWrite == NULL) {
std::cout << "open h2.bin error!" << std::endl;
return;
}
unsigned char ch = 0;
HuffManTreeNode* ptr = tree.GetRoot();
unsigned long long fileSize = ptr->Weight_;
char readBuf[1024] = { 0 };
while (1) {
int n = fread(readBuf, 1, 1024, pRead);
if (!n)
break;
for (int i = 0; i < n; ++i) {
ch = readBuf[i];
for (int j = 0; j < 8; ++j) {
if (ch & 0x80) {//最高位为1
ptr = ptr->pRight_;
}
else {
ptr = ptr->pLeft_;
}
ch = ch << 1;
if (!ptr->pLeft_ && !ptr->pRight_) {//已经找到叶子节点
fputc(ptr->Ch_, pWrite);
--fileSize;
ptr = tree.GetRoot();
if (!fileSize)
goto LOOP;
}
}
}
}
LOOP:
fclose(pRead);
fclose(pWrite);
}
// Parses the header written by WriteHead from `pRead` into `count`.
//
// Header format: a first line with the number of records, then one record
// "<byte>:<decimal count>\n" per byte value that occurs. `count` is indexed
// by byte value; entries without a record are left untouched. Parsing stops
// quietly when the stream ends before all announced records were read.
void FileCompressHuffman::ReadHead(FILE* pRead, std::vector<int>& count){
    char buf[1024] = { 0 };
    const int bufSize = static_cast<int>(sizeof(buf));

    // First line: number of records that follow.
    if (!fgets(buf, bufSize, pRead)) {
        return;
    }
    const int rows = atoi(buf);

    for (int i = 0; i < rows; ++i) {
        if (!fgets(buf, bufSize, pRead)) {
            break; // truncated header: do not re-parse stale buffer contents
        }
        // A record for the newline byte itself makes fgets stop right after
        // its first character; read the rest (":<count>\n") behind it.
        if (buf[0] == '\n') {
            if (!fgets(buf + 1, bufSize - 1, pRead)) {
                break;
            }
        }
        const unsigned char ch = buf[0];
        if (ch < count.size()) {
            count[ch] = atoi(buf + 2); // skip "<byte>:"
        }
    }
}
// Writes the frequency header to `pWrite`.
//
// Format: a first line with the number of records, then one record
// "<byte>:<decimal count>\n" for every byte value whose count is positive.
// Uses std::to_string instead of the non-standard _itoa so the function
// builds with any C++11 toolchain; the bytes written are the same.
void FileCompressHuffman::WriteHead(FILE* pWrite, const std::vector<int>& count) {
    std::string records;
    int rows = 0;
    const size_t symbols = count.size() < 256 ? count.size() : 256;
    for (size_t i = 0; i < symbols; ++i) {
        if (count[i] > 0) {
            records += static_cast<char>(static_cast<unsigned char>(i));
            records += ':';
            records += std::to_string(count[i]);
            records += '\n';
            ++rows;
        }
    }
    // The record count goes first so the reader knows how many lines follow.
    const std::string headstr = std::to_string(rows) + '\n' + records;
    fwrite(headstr.c_str(), 1, headstr.size(), pWrite);
}
void FileCompressHuffman::GetHuffmanCode(HuffManTreeNode* ptr, std::vector<std::string>& charPassWord) {
static std::string passWord = "";
if (!ptr->pLeft_ && !ptr->pRight_) {//当它为叶子节点时
charPassWord[ptr->Ch_] = passWord;
return;
}
if (ptr->pLeft_) {
passWord += '0';
GetHuffmanCode(ptr->pLeft_, charPassWord);
passWord.erase(passWord.end() - 1);
}
if (ptr->pRight_) {
passWord += '1';
GetHuffmanCode(ptr->pRight_, charPassWord);
passWord.erase(passWord.end() - 1);
}
}
#include
#include"HashTable.hpp"
// Width of a hash address in bits.
const USH HASH_BITS = 15;
// Number of distinct hash addresses: 2^15 = 32K.
const USH HASH_SIZE = static_cast<USH>(1 << HASH_BITS);
// Low 15 bits all set; AND-ing with it keeps an address inside the table.
const USH HASH_MASK = static_cast<USH>(HASH_SIZE - 1);
// Allocates one array of 2 * size entries: the first half is prev_, the
// second half is head_. Every entry starts out as 0.
HashTable::HashTable(USH size)
    : prev_(new USH[size * 2])
    , head_(prev_ + size)
{
    const size_t totalBytes = sizeof(USH) * 2 * size;
    memset(prev_, 0, totalBytes);
}
// Releases the shared prev_/head_ allocation (head_ points into the same array).
HashTable::~HashTable() {
    USH* const storage = prev_;
    prev_ = nullptr;
    delete[] storage;
}
// Registers window position `pos` under the hash obtained by folding `ch`
// into `hashAddr`, and reports the previous newest position for that hash
// in `matchhead`.
void HashTable::Insert(USH& matchhead, UCH ch, USH pos, USH& hashAddr) {
    // Advance the rolling hash to the address for this insertion.
    hashFunc(hashAddr, ch);

    // The old chain head is the first candidate for a match ...
    const USH previousHead = head_[hashAddr];
    matchhead = previousHead;

    // ... and becomes the predecessor of `pos` in the chain.
    prev_[pos & HASH_MASK] = previousHead;
    head_[hashAddr] = pos;
}
// Returns the position that was chained before `matchHead`.
USH HashTable::GetNext(USH matchHead) {
    const USH slot = matchHead & HASH_MASK;
    return prev_[slot];
}
void HashTable::Update() {
for (size_t i = 0; i < WSIZE; ++i) {
//更新head
if (head_[i] > WSIZE)
head_[i] -= WSIZE;
else
head_[i] = 0;
//更新prev
if (prev_[i] > WSIZE)
prev_[i] -= WSIZE;
else
prev_[i] = 0;
}
}
// Rolling hash step: shift the previous address, mix in `ch`, and keep the
// low HASH_BITS bits.
void HashTable::hashFunc(USH& hashAddr, UCH ch) {
    const int shifted = hashAddr << H_SHIFT();
    const int mixed = shifted ^ ch;
    hashAddr = static_cast<USH>(mixed & HASH_MASK);
}
// Shift per hashed byte: HASH_BITS divided by MIN_MATCH, rounded up.
USH HashTable::H_SHIFT() {
    const int roundedUp = HASH_BITS + MIN_MATCH - 1;
    return static_cast<USH>(roundedUp / MIN_MATCH);
}
#define _CRT_SECURE_NO_WARNINGS 1
#include
#include"LZ77.hpp"
// Look-ahead needed so that the last match attempt can still reach the
// maximum match length (258).
const USH MIN_LOOKAHEAD = static_cast<USH>(MAX_MATCH + MIN_MATCH + 1);
// Largest distance a match may reach back.
const USH MAX_DIST = static_cast<USH>(WSIZE - MIN_LOOKAHEAD);
// Allocates the 2 * WSIZE byte sliding window and a hash table with WSIZE
// entries per table.
//
// The window is value-initialised to zero. The compressor hashes
// pWin_[start + 2] and compares up to MAX_MATCH bytes from `start`, which
// reads past the loaded data when the input is shorter than the window;
// zero-filling makes those reads well defined and repeatable.
LZ77::LZ77()
    : pWin_(new UCH[WSIZE * 2]())
    , ht_(WSIZE)
{}
// Frees the sliding window buffer.
LZ77::~LZ77() {
    UCH* const window = pWin_;
    pWin_ = nullptr;
    delete[] window;
}
void LZ77::CompressionFile(const std::string& fileName) {
fileName_ = fileName;
FILE* fR = fopen(fileName.c_str(), "rb");
if (!fR) {
std::cout << "待压缩文件打开失败!" << std::endl;
return;
}
//计算文件大小
fseek(fR, 0, SEEK_END);
ULL fileSize = ftell(fR);
if (fileSize <= MIN_MATCH) {
std::cout << "文件太小!不进行压缩!!" << std::endl;
return;
}
//将文件指针置回起始位置
fseek(fR, 0, SEEK_SET);
//从压缩文件中读取一个缓冲区的数据到窗口中
size_t lookAhead = fread(pWin_, sizeof(UCH), 2 * WSIZE, fR);
//计算前两个字符的哈希地址
USH hashAddr = 0;
for (UCH i = 0; i < MIN_MATCH - 1; ++i) {
ht_.hashFunc(hashAddr, pWin_[i]);
}
FILE* fW = fopen("LZ77.bin", "wb");//写压缩数据
FILE* fWT = fopen("3.bin", "wb");//写数据的标记
if (!fW || !fWT) {
std::cout << "文件打开失败" << std::endl;
return;
}
USH matchHead = 0;//匹配链的头
USH curMatchLen = 0; //最长匹配链的长度
USH curMatchDist = 0; //最长匹配链的距离
USH start = 0; //查找字符串在缓冲区的地址
UCH chNum = 0; //将要写入的标记
UCH bitCount = 0; //记录 标记写了多少位
while (lookAhead) {
//1.将当前三个字符插入到哈希表中,并获取匹配链的头
ht_.Insert(matchHead, pWin_[start + 2], start, hashAddr);
curMatchLen = 0;
curMatchDist = 0;
//2.验证在查找缓冲区中是否找到匹配,如果有匹配,找最长匹配
if (matchHead > 0) {
//顺着匹配链找最长匹配,最终带出<长度,距离>对
curMatchLen = LongestMatch(matchHead, curMatchDist, start);
}
//3.验证是否找到匹配
if (curMatchLen < MIN_MATCH) {//找到
//写原字符
fputc(pWin_[start], fW);
//写标记
WriteFlag(fWT, chNum, bitCount, false);
++start;
--lookAhead;
}
else { //未找到
//写长度
UCH chlen = curMatchLen - 3;
fputc(chlen, fW);
//写距离
fwrite(&curMatchDist, sizeof(curMatchDist), 1, fW);
//写标记
WriteFlag(fWT, chNum, bitCount, true);
lookAhead -= curMatchLen;
//将已经匹配的字符串按照三个一组将其插入到哈希表中
++start; //第一个字符已经插入
--curMatchLen;
while (curMatchLen) {
ht_.Insert(matchHead, pWin_[start + 2], start, hashAddr);
++start;
--curMatchLen;
}
}
//检测先行缓冲区中剩余字符个数
if (lookAhead <= MIN_LOOKAHEAD)
fillWindow(start, fR, lookAhead);
}
//将标记位数不够八位的写入
if (bitCount > 0 && bitCount < 8) {
chNum <<= (8 - bitCount);
fputc(chNum, fWT);
}
fclose(fWT);
fclose(fR);
//合并压缩数据文件和标记文件
MergeFile(fW, fileSize);
fclose(fW);
//将用来保存标记信息的临时文件删除掉
if (remove("3.bin") != 0) {
std::cout << "3.bin删除失败" << std::endl;
}
}
// Appends the contents of the flag file "3.bin" to `fW`, followed by two
// trailer fields: the number of flag bytes and `fileSize`, each written as a
// raw ULL.
//
// If "3.bin" cannot be opened the failure is reported on std::cout and
// nothing is appended; previously the null stream was handed to fread.
void LZ77::MergeFile(FILE* fW, ULL fileSize) {
    FILE* fR = fopen("3.bin", "rb");
    if (!fR) {
        std::cout << "3.bin打开失败" << std::endl;
        return;
    }

    UCH buff[1024]; // a small copy buffer does not need the heap
    ULL rSize = 0;  // total number of flag bytes copied
    size_t readSize = 0;
    while ((readSize = fread(buff, sizeof(UCH), sizeof(buff), fR)) > 0) {
        rSize += readSize;
        fwrite(buff, sizeof(UCH), readSize, fW);
    }

    // Trailer: flag size first, original size last.
    fwrite(&rSize, sizeof(rSize), 1, fW);
    fwrite(&fileSize, sizeof(fileSize), 1, fW);
    fclose(fR);
}
// Slides the window once compression has advanced into its right half:
// the right half is moved to the left half, `start` and the hash table are
// rebased by WSIZE, and the right half is refilled from `fR`. The number of
// bytes read is added to `readSize`. Does nothing while `start` is still in
// the left half.
void LZ77::fillWindow(USH& start, FILE* fR, size_t& readSize) {
    if (start < WSIZE) {
        return;
    }

    UCH* const leftHalf = pWin_;
    UCH* const rightHalf = pWin_ + WSIZE;

    // 1. Move the right half down and clear the space it occupied.
    memcpy(leftHalf, rightHalf, WSIZE);
    memset(rightHalf, 0, WSIZE);
    start -= WSIZE;

    // 2. Stored positions moved down by WSIZE as well.
    ht_.Update();

    // 3. Load up to WSIZE fresh bytes into the right half.
    if (feof(fR)) {
        return;
    }
    readSize += fread(rightHalf, sizeof(UCH), WSIZE, fR);
}
// Follows the hash chain starting at `matchHead` and returns the length of
// the longest run that equals the bytes at `start` (at most MAX_MATCH).
// The distance from `start` back to that run is stored in `MatchDist`.
// The walk ends when the chain drops to `limit` or below, or after the
// chain budget of 255 further links is used up.
USH LZ77::LongestMatch(USH matchHead, USH& MatchDist, USH start) {
    USH bestLen = 0;
    USH bestHead = 0;
    UCH chainBudget = 255;

    // Candidates further back than MAX_DIST are not followed.
    const USH limit = start > MAX_DIST ? start - MAX_DIST : 0;

    // The span to be matched is the same for every candidate.
    UCH* const scanBegin = pWin_ + start;
    UCH* const scanEnd = scanBegin + MAX_MATCH;

    for (;;) {
        UCH* scan = scanBegin;
        UCH* candidate = pWin_ + matchHead;
        while (scan < scanEnd && *scan == *candidate) {
            ++scan;
            ++candidate;
        }
        const USH len = static_cast<USH>(scan - scanBegin);
        if (bestLen < len) {
            bestLen = len;
            bestHead = matchHead;
        }

        matchHead = ht_.GetNext(matchHead);
        if (!(matchHead > limit)) {
            break;
        }
        if (chainBudget-- == 0) {
            break;
        }
    }

    MatchDist = start - bestHead;
    return bestLen;
}
void LZ77::WriteFlag(FILE* file, UCH& chNum, UCH& bitCount, bool isLen) {
chNum <<= 1;
if (isLen)
chNum |= 1;
++bitCount;
if (bitCount == 8) {
fputc(chNum, file);
bitCount = 0;
chNum = 0;
}
}
void LZ77::UnCompressionFile(const std::string& fileName) {
FILE* fR = fopen(fileName.c_str(), "rb"); //读取压缩数据
FILE* fRT = fopen(fileName.c_str(), "rb"); //读取标记
if (!fR||!fRT) {
std::cout << "压缩文件打开失败!" << std::endl;
return;
}
ULL fileSize = 0; //读取压缩数据大小
fseek(fRT, 0 - sizeof(fileSize), SEEK_END);
fread(&fileSize, sizeof(fileSize), 1, fRT);
ULL flagSize = 0; //读取标记文件大小
fseek(fRT, 0 - sizeof(fileSize) - sizeof(flagSize), SEEK_END);
fread(&flagSize, sizeof(flagSize), 1, fRT);
//将文件指针指向标记文件起始
fseek(fRT, 0 - sizeof(fileSize) - sizeof(flagSize) - flagSize, SEEK_END);
std::string newFile = "new" + fileName_;
FILE* fW = fopen(newFile.c_str(), "wb"); //将解压后的数据写入到新文件
FILE* fWr = fopen(newFile.c_str(), "rb"); //读取新文件已写入的部分
if (!fW||!fWr) {
std::cout << "新文件打开/读取失败" << std::endl;
return;
}
UCH chNum = 0;
UCH bitCount = 0;
ULL enCodeCount = 0;
while (enCodeCount < fileSize) {
//读取标记信息
if (bitCount == 0) {
chNum = fgetc(fRT);
bitCount = 8;
}
if (chNum & 0x80) {//是长度数据
//读取长度
USH strLength = fgetc(fR) + 3;
//读取距离
USH strDist = 0;
fread(&strDist, sizeof(strDist), 1, fR);
//清空缓冲区
fflush(fW);
enCodeCount += strLength;
fseek(fWr, 0 - strDist, SEEK_END);
UCH ch = 0;
while (strLength) { //fR:读取前文匹配串中的内容
ch = fgetc(fWr);
fputc(ch, fW);
//在还原长度距离对时,一定要清空缓冲区,否则可能会还原出错
fflush(fW);
--strLength;
}
}
else {//原始字符
UCH ch = fgetc(fR);
fputc(ch, fW);
fflush(fW);
++enCodeCount;
}
chNum <<= 1;
--bitCount;
}
fclose(fR);
fclose(fRT);
fclose(fW);
fclose(fWr);
if (remove(fileName.c_str()) != 0) {
std::cout << fileName << "删除失败" << std::endl;
}
}
#define _CRT_SECURE_NO_WARNINGS 1
#include"FileCompressHuffman.hpp"
#include"FileCompressHuffman.cpp"
#include"LZ77.hpp"
// Round trip: LZ77-compress an image, Huffman-compress the result, then undo
// both steps in reverse order.
void test() {
    LZ77 lz;
    FileCompressHuffman tree;

    const std::string sourceImage = "IMG_5725.PNG";
    const std::string lzOutput = "LZ77.bin";
    const std::string huffmanOutput = "Huffman.bin";
    const std::string huffmanRestored = "h2.bin";

    lz.CompressionFile(sourceImage);
    tree.CompressFile(lzOutput);          // file to compress
    tree.UnCompressFile(huffmanOutput);   // file to decompress
    lz.UnCompressionFile(huffmanRestored);
}
// Entry point: runs the compression round trip in test().
int main() {
test();
// Called after test() has returned, i.e. after its local objects are destroyed.
// NOTE(review): presumably the MSVC debug-CRT leak report — confirm the build targets MSVC.
_CrtDumpMemoryLeaks();
// Runs the shell command "pause" before exiting.
system("pause");
return 0;
}