QQ群聊天记录文件分割

 嗯,如题 是个蛋疼物
目前QQ的聊天记录导出功能很让人郁闷
三种聊天记录格式的导出
1  TXT   没图
2  BAK  只能再导入QQ使用
3  MHT 有图有字,缺点是一旦聊天记录很多,文件体积就会很大,几乎所有的工具都不能正常打开


单纯地把MHT转换成HTML也不行:HTML本身依然很大,再加上图片之类的资源,打开时同样会卡死。
于是只能切开显示。处理思路很简单:把MHT当作一个超大的文本文件,按行顺序处理,先把Base64编码的图片解码后存成文件,再把HTML内容分割成多个页面。
代码如下 只支持单个QQ群导出记录

  1 import io;

  2 import fsys.dlg;

  3 import string;

  4 import crypt.bin;

  5 import fsys.path;

  6 //将Base64的数据转换成图片

  7 function base64images (str,path)

  8 {

  9     if(str)

 10     {

 11         var kzm = string.match(str,"Content-Type:image/(\S+)");

 12         var wjm = string.match(str,"Content-Location\:(\S+.dat)");

 13         startpos,endpos = string.find(str,"}.dat");

 14         if(endpos)

 15         {

 16             var bindata = crypt.bin.decodeBase64(string.trim( string.sub(str,endpos+1)));

 17             //io.print(string.trim( string.sub(str,endpos)));

 18             //execute("pause") //按任意键继续

 19             string.save(path ++ "\\" ++ wjm,bindata);

 20         }

 21         

 22     }

 23 }

 24 //切割记录文件

 25 function split_html (file_path,path)

 26 {

 27     var html_head = '<html xmlns="http://www.w3.org/1999/xhtml"><head><meta http-equiv="Content-Type" content="text/html; charset=UTF-8" /><title>QQ Message</title><style type="text/css">body{font-size:12px; line-height:22px; margin:2px;}td{font-size:12px; line-height:22px;}</style></head><body><table width=100% cellspacing=0>';

 28     var html_foot = '</body></html>';

 29     var file = io.open( file_path, "rt" );

 30     var line_text = file.read();

 31     var line = 0;

 32     var page = 0;

 33     var tmp = '';

 34     while(line_text)

 35     {

 36         line_text = string.match(line_text,"\<tr.*?\</tr\>");

 37         if(line_text)

 38         {

 39             tmp = tmp ++ string.replace(line_text,'@<IMG src="{','<IMG src="images/{');

 40             line++;

 41             if( line>500 )

 42             {

 43                 var f = io.open(path ++page++".html","w+");

 44                 f.write(html_head);

 45                 f.write(tmp);

 46                 f.write('</table>'++'<a href="'++ page+1 ++'.html">Next page</a>');

 47                 f.write(html_foot);

 48                 f.close();  

 49                 page++;

 50                 line = 0;

 51                 tmp = '';

 52             }

 53         }

 54         line_text = file.read();

 55     }

 56     var f = io.open(path ++ page ++ ".html","w+");

 57     f.write(html_head);

 58     f.write(tmp);

 59     f.write('</table>');

 60     f.write(html_foot);

 61     f.close();  

 62     file.close();

 63 }

 64 //打开控制台

 65 io.open();

 66 //选择QQ聊天记录

 67 var mhtPath = fsys.dlg.open("QQ聊天记录文件*.mht|*.mht||");

 68 //开始处理文件

 69 if(mhtPath)

 70 {

 71     var path = io.splitpath(mhtPath);

 72     var img_dir = path.dir ++ path.name ++ '\\images';

 73     var file = io.open( mhtPath, "rt" );

 74     fsys.createDir( path.dir ++ path.name);

 75     fsys.createDir( path.dir ++ path.name ++ '\\images');

 76     var html = io.open( path.dir ++ path.name ++ "\\tmp.html","a+");

 77     var line_text = file.read();

 78     var i = 0; //行数

 79     var is_html = false;

 80     var tmp_image_data = '';

 81     //判断是否是腾讯QQ聊天记录文件

 82     if(string.indexAny(line_text,"Tencent"))

 83     {

 84         

 85         while( line_text  )

 86         { 

 87             //判断文本内容开始处

 88             html_start,xmlns_end = string.find(line_text,"@<html xmlns");

 89             if( 1 == html_start )

 90             {

 91                 is_html = true;

 92             }

 93             //判断文本内容结束处

 94             html_end1,html_end2 = string.find(line_text,"@</table></body></html>");

 95             if( 1 == html_end1 )

 96             {

 97                 //最后一行写入文件

 98                 html.write(line_text);

 99                 html.write('\r\n');

100                 is_html = false;

101                 //break;

102             }

103             

104             if(is_html)

105             {   //将聊天文本内容写入文件

106                 html.write(line_text);

107                 html.write('\r\n');

108             }

109             

110             //切割图片base64数据 

111             if(false == is_html && i>10 )

112             {

113                 if(string.find(line_text,"@------=_NextPart")){

114                     base64images(tmp_image_data,img_dir);

115                     tmp_image_data = line_text ++ '\r\n';

116                 }else{

117                     tmp_image_data = tmp_image_data ++ line_text ++ '\r\n';

118                 }

119             }

120             

121             line_text = file.read();

122             i++;

123             io.print("已处理",i,'行数据');

124         }

125         base64images(tmp_image_data,img_dir);

126         

127     }else {

128     

129         io.print("您选择的文件可能不是QQ导出的mht聊天记录文件");

130         

131     }

132     //关闭文件

133     file.close();

134     html.close();

135     //需要切割html

136     split_html( path.dir ++ path.name ++ "\\tmp.html",path.dir ++ path.name ++ "\\");

137 }

138 else

139 {

140     io.print("请正确的选择QQ导出的mht聊天记录文件");

141 }

142 execute("pause") //按任意键继续

143 io.close();//关闭控制台

 



二进制版本

 

 

 

 

 

 

 

 

 

 

PHP版本(可能还有些小BUG,尚未确认;但处理效率提升不少)

 1 <?php

 2 if(!file_exists($argv[1])){

 3     echo 'There isn\'t have this file.';

 4     exit;

 5 }

 6 ini_set('pcre.backtrack_limit',100000000);

 7 #ini_set('pcre.recursion_limit',100000000);

 8 define("BASEDIR",dirname($argv[1]).'/');

 9 $is_table_end = false;

10 $page = 0;

11 mk_imgdir();

12 $handle = fopen("input.mht", "rb");

13 $contents = '';

14 while (!feof($handle)) {

15     $contents .=fread($handle,204800);

16     $contents = process($contents);

17 }

18 process($contents);

19 

#Main data processing
/**
 * Consume as much of the buffer as possible and return the unconsumed tail.
 *
 * While the message table has not ended, every complete <tr>...</tr> row is
 * collected and written to BASEDIR/messages/ in pages of 200 rows. Once the
 * closing table markup has been seen, complete Base64 image parts are handed
 * to put_images().
 *
 * Uses the globals $is_table_end (message part finished) and $page (index of
 * the next page file to write).
 *
 * @param string $contents Buffered, not yet processed file data.
 * @return string Data that must be kept and prepended to the next read.
 */
function process($contents){
    $html_head = '<html xmlns="http://www.w3.org/1999/xhtml"><head><meta http-equiv="Content-Type" content="text/html; charset=UTF-8" /><title>QQ Message</title><style type="text/css">body{font-size:12px; line-height:22px; margin:2px;}td{font-size:12px; line-height:22px;}</style></head><body><table width=100% cellspacing=0>';
    $html_foot = '</table></body></html>';
    global $is_table_end;
    global $page;
    if(false === $is_table_end){
        if( false !== strpos($contents,$html_foot) ){
            $is_table_end = true;// message part is complete; only images follow
        }
        $r = preg_match_all ('|<tr.*?\</tr\>|ims',  $contents , $matches ,PREG_OFFSET_CAPTURE);
        if($r){
            $chunks = array_chunk($matches[0],200);
            foreach($chunks as $key=>$val){
                $arr = array_column($val,0);
                // Rewrite image references to the two-level directory layout used by put_images()
                array_walk( $arr , function(&$v, $k) { $v = preg_replace('|<IMG src="{(\S)(\S)(\S+).dat|ims','<IMG src="../images/$1/$2/{$1$2$3.dat',$v);});
                file_put_contents(BASEDIR.'messages/'.sprintf("%08d", $page+$key).'.html' ,$html_head . implode('',$arr).'<td><H1><a href="./'.sprintf("%08d", $page+$key+1).'.html">Next page</a></h1></td>'.$html_foot);
            }
            // Advance past every page written above; adding only the last key
            // would make the next call overwrite the last page.
            $page += count($chunks);
            // Drop everything up to the end of the last complete row.
            $pos = end($val);
            $contents = (string)substr($contents,$pos[1] + strlen($pos[0]));
        }
        // No complete row yet: keep the buffer so a row or the footer that is
        // split across two reads is not lost.
    }
    if(true === $is_table_end){
        // Image parts
        $r = preg_match_all ('|Content-Type:image.*?:base64.*?Content-Location:(.*?)\.dat(.*?)(?:------=_)|ims',  $contents , $matches ,PREG_OFFSET_CAPTURE | PREG_SET_ORDER);
        if($r){
            $result = array();
            foreach($matches as $val){
                $result[] = array('name'=>$val[1][0],'contents'=>$val[2][0]);
                // Write in batches of 150
                if(count($result) >=150){
                    put_images($result);
                    $result = array();
                }
            }
            put_images($result);
            // Drop the images already written but keep the trailing boundary
            // marker, so the buffer still starts at a part boundary.
            $boundary = '------=_';
            $contents = (string)substr($contents,$val[0][1] + strlen($val[0][0]) - strlen($boundary));
        }
        // No complete image yet (e.g. one larger than a single read): keep the
        // buffer until its terminating boundary arrives.
    }
    return $contents;
}

67 

68 

#Write image / emoticon files to disk
/**
 * Decode and store a batch of images.
 *
 * Each file goes to BASEDIR/images/<c1>/<c2>/<name>.dat, where c1 and c2 are
 * the characters at offsets 1 and 2 of the name (offset 0 is the opening
 * brace). The target path of every file is echoed as a log line.
 *
 * @param array $data List of arrays with keys 'name' (file name without
 *                    extension) and 'contents' (Base64 text).
 */
function put_images($data){
    foreach($data as $val){
        // The name is captured straight after the header colon; strip stray whitespace.
        $name = trim($val['name']);
        // Use BASEDIR so the files land in the directories created by mk_imgdir().
        $dir = BASEDIR.'images/'. substr($name,1,1) .'/' . substr($name,2,1) .'/'.$name . '.dat';
        echo $dir."\r\n";
        file_put_contents($dir,base64_decode(trim($val['contents'])));
    }
}

77 

#Create the output directories
/**
 * Prepare BASEDIR/images (with a two-level 0-9/A-Z bucket tree) and
 * BASEDIR/messages. An existing directory is first renamed to <name>_old.
 *
 * @return int Always 0.
 */
function mk_imgdir(){
    $images_root   = BASEDIR.'images';
    $messages_root = BASEDIR.'messages';

    if(file_exists($images_root)){
        rename($images_root,$images_root.'_old');
    }

    // One bucket per digit and upper-case letter, nested two levels deep.
    $buckets = str_split('0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ');
    foreach($buckets as $first){
        $level1 = $images_root.'/'.$first;
        mkdir($level1,0777,true);
        foreach($buckets as $second){
            mkdir($level1.'/'.$second,0777,true);
        }
    }

    if(file_exists($messages_root)){
        rename($messages_root,$messages_root.'_old');
    }
    mkdir($messages_root.'/',0777,true);

    return 0;
}

使用方法

1 "php.exe" cli.php input.mht > log.txt

2 pause

 

你可能感兴趣的:(qq)