*=============================================================== * Copyright (C) 2013 All rights reserved. * * 文件名称:StringProcess.cpp * 创 建 者: * 创建日期:2013年04月24日 * 描 述: * 备 注: * 更新日志: * ================================================================*/ #include<stdio.h> #include<string.h> #include<stdlib.h> #include <sys/time.h> #include<ctype.h> #include<locale.h> #include "boost/regex.hpp" #include <iconv.h> #include <errno.h> #include<algorithm> // please add your code here! using namespace std; #define MAX_LINE_LENGTH 1048576 #define TAGLEN 50 /************************************************************ * @brief <funcName:trim> Author:刘禹 finallyly 20130425 去掉字符串首尾空格 ================================================== * @param s ================================================== **********************************************************/ void trim(char *s) { char *start; char *end; int len=strlen(s); start=s; end=s+len-1; while(1) { char c=*start; if(!isspace(c)) { break; } start++; if(start>end) { s[0]='\0'; return ; } } while(1) { char c=*end; if(!isspace(c)) { break; } end --; if(start>end) { s[0]='\0'; return; } } memmove(s,start,end-start+1); s[end-start+1]='\0'; return; } inline bool strTolower( char* str ) { if ( !str ) return false; int i = 0; bool flag = true; while ( str[i] ) { if ( 'A' <= str[i] && 'Z' >= str[i] ) { str[i] += 32; } else if ( 'a' <= str[i] && 'z' >= str[i] ) { } else { flag = false; } ++i; } return flag; } /************************************************************ * @brief <funcName:> Author:刘禹 finallyly * 从系统默认的汉字编码本机是GBK转unicode,宽字符保存 ================================================== * @param sToMatch ================================================== * @return **********************************************************/ wstring String2Wstring(string sToMatch) { wstring wsToMatch; setlocale( LC_CTYPE, "" ); // 很重要,没有这一句,转换会失败。 int iWLen = mbstowcs( NULL, sToMatch.c_str(), sToMatch.length() ); // 计算转换后宽字符串的长度。(不包含字符串结束符) if(iWLen>0) { wchar_t *lpwsz = new wchar_t[iWLen + 1]; int i = mbstowcs( lpwsz, sToMatch.c_str(), sToMatch.length() ); // 转换。(转换后的字符串有结束符) wsToMatch.assign(lpwsz); delete []lpwsz; } else { wsToMatch=L""; } return wsToMatch; } /************************************************************ * @brief <funcName:> Author:刘禹 finallyly * Unicode转系统自带编码,用于输出 ================================================== * @param sToMatch ================================================== * @return **********************************************************/ string Wstring2String(wstring sToMatch) { string sResult; int iLen = wcstombs( NULL, sToMatch.c_str(), 0 ); // 计算转换后字符串的长度。(不包含字符串结束符) if(iLen>0) { char *lpsz = new char[iLen + 1]; int i = wcstombs( lpsz, sToMatch.c_str(), iLen ); // 转换。(没有结束符) lpsz[iLen] = '\0'; sResult.assign(lpsz); delete []lpsz; } else { sResult=""; } return sResult; } /************************************************************ * @brief <funcName:> Author:刘禹 finallyly * 从指定编码转换到目标编码 ================================================== * @param toCode ================================================== * @param fromCode ================================================== * @param srcstr ================================================== * @param deststr ================================================== * @param srclen ================================================== * @param destlen ================================================== * @return **********************************************************/ int toAnotherCode(const char *toCode,const char *fromCode,char *srcstr, char *deststr, size_t srclen,size_t &destlen) { iconv_t convertor=iconv_open(toCode,fromCode); size_t inputsize; size_t outputsize; size_t oldoutputsize; char *input, *inputold; char *output=NULL; char *outputold=NULL; int flag=0; if(convertor==iconv_t(-1)) { fprintf(stderr,"convertor device initailization failed!\n"); return 1; } else { inputsize=srclen; input=new char[inputsize+1]; memcpy(input,srcstr,inputsize); input[inputsize]='\0'; inputold=input; outputsize=inputsize*5; oldoutputsize=outputsize; output=new char[outputsize]; output[0]=0; outputold=output; size_t rc = iconv(convertor,&input,&inputsize,&output,&outputsize); memcpy(deststr,outputold,oldoutputsize-outputsize); deststr[destlen]=0; destlen=oldoutputsize-outputsize; if(rc>0) { flag=1; } delete []inputold; delete []outputold; } iconv_close(convertor); if(flag==1) { return 0; } else { return 1; } } /************************************************************ * @brief <funcName:PrintUsage> Author:刘禹 finallyly 20130424 ================================================== **********************************************************/ void PrintUsage() { fprintf( stderr, "prog [IN]hzpylist_file [IN]input_file [OUT]output_file [OUT]errdmp_file\n" ); } void testRegex() { string s="刘禹,刘德华,刘佳佳。。。王大虎。。。刘长春,xixi"; string t="刘[^刘]*?,"; wstring p=String2Wstring(t); wstring ws=String2Wstring(s); boost::wregex wreg(p,boost::regbase::icase|boost::regex::perl); boost::wsmatch wm; vector<string> results; wstring::const_iterator it=ws.begin(); wstring::const_iterator end=ws.end(); while(boost::regex_search(it,end,wm,wreg)) { wstring wtemp=wm[0]; string temp=Wstring2String(wtemp); results.push_back(temp); it=wm[0].second; } fprintf(stdout,"输出正则匹配结果\n"); for(vector<string>::iterator it=results.begin();it!=results.end();it++) { printf("%s\n",(*it).c_str()); } } int LoadFile(char* inputfile) { FILE *fin = NULL; char line[102400] = {0}; char word[102400] = {0}; int len = 0; fin = fopen(inputfile, "r"); if (NULL == fin) { fprintf(stderr,"LoadAddress can not open inputfilename %s\n", inputfile); return 1; } while(true) { fgets(line, 102400, fin); if (feof(fin)) { break; } len = strlen(line); if (0 == line[0] || '\n' != line[len - 1]) { continue; } line[len - 1] = 0; string pattern ="首都或首府:"; string p1="([\u2E80-\u9FFF])+"; wstring wp1 = String2Wstring(p1); //wstring wpattern = L"([\u2E80-\u9FFF])+"; wstring wpattern = L"([\u2E80-\u9FFF]+)"+String2Wstring(pattern)+L"([\u2E80-\u9FFF]+)"; wstring winputstr = String2Wstring(line); boost::wregex wreg(wpattern, boost::regex::perl|boost::regbase::icase); boost::smatch what; boost::wsmatch wswhat; wstring::const_iterator wstrit = winputstr.begin(); wstring::const_iterator wstrend = winputstr.end(); while (boost::regex_search(wstrit, wstrend, wswhat, wreg)) { wstring ws1 = wswhat[1]; wstring ws2 = wswhat[2]; string s1 = Wstring2String(ws1); string s2 = Wstring2String(ws2); fprintf(stdout, "%s\t%s\n", s1.c_str(), s2.c_str()); wstrit=wswhat[0].second; } } if (NULL != fin) { fclose(fin); fin = NULL; } return 0; } int main( int argc, char *argv[] ) { timeval tv1, tv2; gettimeofday(&tv1, NULL); if ( 2 != argc ) { PrintUsage(); return 1; } LoadFile(argv[1]); gettimeofday(&tv2, NULL); fprintf(stderr,"%s has finished congratulations!\n",argv[0]); fprintf( stderr,"time elapsed: %.2f ms\n", (float)((tv2.tv_sec - tv1.tv_sec)*1000000+(tv2.tv_usec-tv1.tv_usec))/1000); return 0; }