【原型】根据url将目标网页打包成mht

其流程是这样的

1:根据URL获取目标网页HTML
2:剔除脚本
3:提取图片地址,仅处理http开头的地址
4:处理原HTML,把原来的img src内容替换掉,并给每个img一个id标识,为后面的脚本服务
5:抓取图片,并处理成base64编码,组合成xmp文件头
6:组合HTML代码,保存成物理文件


这只是个原型,实际上要真正完整抓取,还要解析html中的css文件,js文件。
还有路径处理,我这里只处理了http开头的完整url,相对路径也是需要处理的。


本文的JS来源于下帖中的98楼:
http://topic.csdn.net/u/20100127/14/A961DF9A-34A1-44F1-9F9A-C10A4D168736.html

下面是原型函数:

 

/// <summary>
/// Downloads the page at <paramref name="url1"/>, embeds its images as base64 parts
/// inside a hidden <c>&lt;xmp&gt;</c> element and saves the result as a single
/// self-contained <c>.htm</c> file on drive <c>D:\</c>.
/// </summary>
/// <remarks>
/// Processing steps:
/// 1. download the HTML and strip every <c>&lt;script&gt;</c> element;
/// 2. find each <c>&lt;img&gt;</c> tag; for absolute http(s) URLs ending in
///    jpg/jpeg/png/bmp/gif, download the image and store it as a MIME part;
/// 3. rewrite that tag so it has an empty <c>src</c> and a generated <c>id</c>;
/// 4. append a script that, on load, resolves each generated id to a
///    <c>data:</c> URI (or an <c>mhtml:</c> URL on old IE) built from the MIME part.
/// Only images are embedded; CSS files, script files and relative URLs are not handled.
/// Download and file-system exceptions are not caught and propagate to the caller.
/// </remarks>
/// <param name="url1">Absolute URL of the page to save.</param>
/// <returns>
/// The raw <c>src</c> value of every <c>&lt;img&gt;</c> tag found, each followed by
/// <c>&lt;br&gt;</c> (embedded or not).
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="url1"/> is null.</exception>
public string getRemotePage(string url1)
{
    if (url1 == null)
    {
        throw new ArgumentNullException("url1");
    }

    // Shared by the MIME-like part list below and by the JavaScript that splits it.
    const string boundary = "[MHTML_EMBED_CODE]";

    System.Text.StringBuilder imageList = new System.Text.StringBuilder();
    System.Text.StringBuilder embeddedParts = new System.Text.StringBuilder();
    System.Text.StringBuilder bindScript = new System.Text.StringBuilder();
    System.Text.StringBuilder rewrittenHtml;

    // To go through a proxy, configure wc.Proxy (System.Net.WebProxy) before downloading.
    using (System.Net.WebClient wc = new System.Net.WebClient())
    {
        byte[] pageData = wc.DownloadData(url1);

        // NOTE(review): the page is decoded with the machine's default ANSI code page but
        // written back as GB2312 below; pages in another charset (e.g. UTF-8) will be
        // garbled. Kept as in the original - confirm the intended charset handling.
        string pageHtml = System.Text.Encoding.Default.GetString(pageData);

        // Remove scripts, including those whose body contains '<' (e.g. "i < n").
        pageHtml = Regex.Replace(
            pageHtml,
            @"<script\b[^>]*>[\s\S]*?</script\s*>",
            "",
            RegexOptions.IgnoreCase);

        // One match per <img> tag; the imgUrl group captures the src value whether it is
        // double-quoted, single-quoted or unquoted.
        MatchCollection imageTags = Regex.Matches(
            pageHtml,
            @"<img\b[^<>]*?\ssrc\s*=\s*(?:""(?<imgUrl>[^""<>]*)""|'(?<imgUrl>[^'<>]*)'|(?<imgUrl>[^\s""'<>]+))[^<>]*>",
            RegexOptions.IgnoreCase);

        // URL -> Content-Location of the part already emitted for it, so an image that is
        // used several times is downloaded and stored only once.
        System.Collections.Generic.Dictionary<string, string> partLocationByUrl =
            new System.Collections.Generic.Dictionary<string, string>();

        rewrittenHtml = new System.Text.StringBuilder(pageHtml.Length);
        int cursor = 0;      // start of the not-yet-copied region of pageHtml
        int imageCount = 0;  // number of rewritten <img> tags, used to build unique ids

        foreach (Match tag in imageTags)
        {
            Group src = tag.Groups["imgUrl"];
            imageList.Append(src.Value).Append("<br>");

            string imageUrl = src.Value.Trim();
            string mimeSubtype = GetEmbeddableImageSubtype(imageUrl);
            if (mimeSubtype == null)
            {
                // Not embeddable: the tag is copied unchanged with the next segment.
                continue;
            }

            imageCount = imageCount + 1;
            string imageId = "MY_TEST_AIYA_" + imageCount.ToString();

            string partLocation;
            if (!partLocationByUrl.TryGetValue(imageUrl, out partLocation))
            {
                partLocation = "B_" + imageId;
                string base64 = Convert.ToBase64String(wc.DownloadData(imageUrl));

                // The blank line before and after the payload is what the script's
                // imageCode regex relies on to cut the base64 text out of the part.
                embeddedParts.Append("\n--").Append(boundary).Append("\n");
                embeddedParts.Append("Content-Type: image/").Append(mimeSubtype).Append("\n");
                embeddedParts.Append("Content-Location: ").Append(partLocation).Append("\n");
                embeddedParts.Append("Content-Transfer-Encoding: base64").Append("\n");
                embeddedParts.Append("\n");
                embeddedParts.Append(base64).Append("\n");

                partLocationByUrl.Add(imageUrl, partLocation);
            }

            // Copy the untouched HTML before this tag, then the rewritten tag itself.
            rewrittenHtml.Append(pageHtml, cursor, tag.Index - cursor);
            rewrittenHtml.Append(
                RewriteImageTag(tag.Value, src.Index - tag.Index, src.Length, "A_" + imageId));
            cursor = tag.Index + tag.Length;

            bindScript.Append("\n var " + imageId + "=document.getElementById(\"A_" + imageId + "\");");
            bindScript.Append("\n " + imageId + ".src=getEmbedImageSrc(\"" + partLocation + "\");");
        }

        rewrittenHtml.Append(pageHtml, cursor, pageHtml.Length - cursor);
    }

    // Hidden <xmp> element holding all image parts as plain text.
    string partsBlock =
        "<xmp id=\"mthmlCodes\" style=\"display:none;\">\n"
        + "Content-Type: multipart/related; boundary=\"" + boundary + "\"\n\n"
        + embeddedParts.ToString()
        + "\n--" + boundary + "\n</xmp>";

    // Client-side helper that turns a Content-Location into a usable image URL.
    // It must stay multi-line: it contains '//' comments.
    const string embedScript = @"
<script type=""text/javascript"">
var $=function(id){
    return document.getElementById(id);
}
function getEmbedImageSrc(locationCode){
    //for ie6、ie7
    if(document.all&&navigator.userAgent.indexOf(""MSIE 8.0"")==-1){
        return 'mhtml:'+document.location.href+'!'+locationCode;
    }
    //for ie8、FireFox 、Opera
    var codePart=(function(){
        var codes=$(""mthmlCodes"")
            .innerHTML
            .split(""--[MHTML_EMBED_CODE]"");
        var pattern=new RegExp(
            ""Content-Location:\\s*""
            +locationCode
            +""\\s*\\n""
        );
        var code,ret;
        for(var i=0,len=codes.length;i<len;i++){
            code=codes[i];
            if(code&&code.match(pattern)){
                ret=code;
                break;
            }
        }
        return ret||"""";
    })();
    var imageType=(function(){
        codePart.match(/Content-Type:\s*(.*?)\s*\n/)
        return RegExp.$1;
    })();
    var imageCode=(function(){
        codePart.match(/(\n|\r\n){2,}([\S\s]*?)(\n|\r\n){2,}/);
        return RegExp.$2;
    })();
    return ""data:""+imageType+"";base64,""+imageCode;
}
</script>
";

    // The original markup is kept as is; the parts go in front of it and the scripts after it.
    string output =
        partsBlock + "\n"
        + rewrittenHtml.ToString() + "\n"
        + embedScript
        + "\n<script type=\"text/javascript\">" + bindScript.ToString() + "\n</script>";

    // The extension does not have to be .mht; a plain .htm works just as well.
    string outputPath = "D:\\" + DateTime.Now.Ticks.ToString() + ".htm";
    using (StreamWriter writer = new StreamWriter(
        outputPath, false, System.Text.Encoding.GetEncoding("GB2312")))
    {
        writer.Write(output);
    }

    return imageList.ToString();
}

/// <summary>
/// Returns the MIME image subtype ("jpeg", "png", "bmp" or "gif") for an absolute
/// http/https URL whose path ends in a supported image extension, or null otherwise.
/// </summary>
private static string GetEmbeddableImageSubtype(string imageUrl)
{
    Uri parsed;
    if (!Uri.TryCreate(imageUrl, UriKind.Absolute, out parsed))
    {
        return null;
    }

    if (parsed.Scheme != Uri.UriSchemeHttp && parsed.Scheme != Uri.UriSchemeHttps)
    {
        return null;
    }

    // Look at the path only, so a query string ("a.jpg?v=2") does not hide the extension.
    string path = parsed.AbsolutePath;
    int dot = path.LastIndexOf('.');
    string extension = dot >= 0 ? path.Substring(dot).ToLowerInvariant() : "";

    switch (extension)
    {
        case ".jpg":
        case ".jpeg":
            return "jpeg"; // "image/jpg" is not a registered MIME type
        case ".png":
            return "png";
        case ".bmp":
            return "bmp";
        case ".gif":
            return "gif";
        default:
            return null;
    }
}

/// <summary>
/// Rewrites one <c>&lt;img&gt;</c> tag: empties its src value, removes any existing id
/// attribute and inserts <paramref name="elementId"/> as the new id.
/// </summary>
private static string RewriteImageTag(string tagText, int srcValueStart, int srcValueLength, string elementId)
{
    string beforeValue = tagText.Substring(0, srcValueStart);
    string afterValue = tagText.Substring(srcValueStart + srcValueLength);

    // A quoted value leaves src="" behind once emptied; an unquoted one needs explicit
    // quotes, otherwise the following attribute would be parsed as the src value.
    bool quoted = srcValueStart > 0
        && (tagText[srcValueStart - 1] == '"' || tagText[srcValueStart - 1] == '\'');
    string rewritten = quoted
        ? beforeValue + afterValue
        : beforeValue + "\"\"" + afterValue;

    // Drop an existing id so the tag does not end up with two id attributes.
    rewritten = Regex.Replace(
        rewritten,
        @"\sid\s*=\s*(?:""[^""]*""|'[^']*'|[^\s""'<>]+)",
        "",
        RegexOptions.IgnoreCase);

    // Every matched tag starts with the four characters "<img" (in any letter case).
    return rewritten.Substring(0, 4) + " id=\"" + elementId + "\"" + rewritten.Substring(4);
}

你可能感兴趣的:(html,String,function,脚本,服务器,url)