抓取源码爱好者所有网页特效例子并保存到本地

提取 http://www.codefans.net/jscss/code/1866.shtml 等类似网页中运行区块的html代码, 并保存到本地.

应该是将 源码爱好者 » 网页特效代码 下面的子目录都抓下来了, 我机器上运行了15min抓了有10几个子目录  共4M多

用到了正则表达式,文件读取保存, 多线程

 是用vs2005写的, winform

form1.cs

代码
/* *************************************************************************************************************
 * 本程序多线程从特定网页中提取一块内容
 * 具体从http://www.codefans.net/jscss/code/1866.shtml提取中间演示textarea内的html文本
 * 从(网页特效代码)->(详细分类)->html网页中textarea内容
 * 
 * 程序内使用了1.多线程 2.正则表达式 3.web文件读取 4.本地文件保存及编码问题
 * 
 * 
 * 
 * 线程挂起没实现 好像用ThreadPool可以暂停纯种线程
 * 
 * 
 * 
 * 
 * 
 **************************************************************************************************************
*/
using  System;
using  System.Collections.Generic;
using  System.ComponentModel;
using  System.Data;
using  System.Drawing;
using  System.Text;
using  System.Windows.Forms;
using  System.Net;
using  System.IO;
using  System.Text.RegularExpressions;
using  System.Threading;

namespace  dig
{
    
public   partial   class  form1 : Form
    {
        
// Worker thread that runs GetFileAndSave; stays null until btnStart_Click creates it.
private  Thread getFileThread  =   null ;
        
// Taken when the form object is constructed; timer1_Tick subtracts it from the current time.
private  DateTime startTime  =  DateTime.Now;
        
private   string  strCurUrl  =   "" ; // URL currently being processed (written by the worker, shown by timer1_Tick)
         private   string  strSaveUrl  =   "" ; // Path of the last page saved, or a failure message (shown by timer1_Tick)
         public  form1()
        {
            InitializeComponent();
        }
        
        
/// <summary>
/// Starts the download worker when the Start button is clicked.
/// Example of a page the worker processes: http://www.codefans.net/jscss/code/1866.shtml
/// </summary>
/// <param name="sender">Control that raised the event (unused).</param>
/// <param name="e">Event data (unused).</param>
private void btnStart_Click(object sender, EventArgs e)
{
    // A Thread object cannot be restarted once it has finished or been aborted,
    // so a dead worker is treated like "no worker" and replaced with a new one.
    if (getFileThread != null && getFileThread.IsAlive)
    {
        return;
    }

    // Measure the elapsed time shown by timer1_Tick from the start of this run.
    startTime = DateTime.Now;

    getFileThread = new Thread(new ThreadStart(GetFileAndSave));
    // A background thread does not keep the process alive after the form is gone.
    getFileThread.IsBackground = true;
    getFileThread.Start();
}

        
/// <summary>
/// Worker-thread entry point: walks page ids 0 through 9998 in order and passes the
/// URL of each one to <see cref="CreateHtmlPage"/>.
/// </summary>
private void GetFileAndSave()
{
    for (int i = 0; i < 9999; ++i)
    {
        // The id is used as-is (no zero padding), e.g. .../jscss/code/1866.shtml
        CreateHtmlPage("http://www.codefans.net/jscss/code/" + i + ".shtml");
    }
}

        
/// <summary>
/// Downloads one page, extracts the demo HTML held in its &lt;textarea&gt;, and saves it
/// through SaveFile in a folder named after the page's category link, using the page
/// title as the file name.
/// </summary>
/// <remarks>
/// Failures (network error, page without the expected markup, disk error) are recorded by
/// appending the URL to c:\error.txt; the method does not throw for them.
/// A <see cref="ThreadAbortException"/> is passed through without being logged, so stopping
/// the worker does not mark the URL in progress as failed.
/// </remarks>
/// <param name="strUrl">Absolute URL of the page to fetch.</param>
private void CreateHtmlPage(string strUrl)
{
    try
    {
        // Published first so the UI timer can show which URL is being worked on.
        strCurUrl = strUrl;

        string pageHtml = DownloadPage(strUrl);

        // File name: the <title> text that precedes the "_源码爱好者" site suffix.
        Match titleMatch = new Regex(@"(?<=(<title>)).*(?=_源码爱好者</title>)").Match(pageHtml);

        // Folder name is looked up just after this marker text.
        int markerIndex = pageHtml.IndexOf("网页特效代码", StringComparison.Ordinal);

        // Content between <textarea ...> and </textarea>. The body is lazy so that a page
        // with several textareas yields only the first one instead of everything up to
        // the last closing tag.
        Match areaMatch = new Regex(@"(?<=(<textarea.*?>))([\w\W]*?)(?=</textarea>)").Match(pageHtml);

        if (!titleMatch.Success || markerIndex < 0 || !areaMatch.Success)
        {
            // Not a code page in the expected layout.
            ReportFailure(strUrl);
            return;
        }

        // Look at up to 100 characters after the marker; clamp so a marker close to the
        // end of the document does not make Substring throw.
        int sliceLength = Math.Min(100, pageHtml.Length - markerIndex);
        string markerSlice = pageHtml.Substring(markerIndex, sliceLength);
        string linkText = new Regex(@"(?<=(<a.*>)).*(?=</a>)", RegexOptions.IgnoreCase).Match(markerSlice).ToString();

        // Titles and link texts may contain characters that are illegal in Windows names.
        string strFloderName = MakeSafeFileName(linkText);
        string strFileName = MakeSafeFileName(titleMatch.ToString());
        if (strFileName.Length == 0)
        {
            ReportFailure(strUrl);
            return;
        }

        string demoHtml = DecodeEntities(areaMatch.ToString());

        string path = SaveFile(demoHtml, strFileName, strFloderName);

        // Location of the file just written, shown by the UI timer.
        strSaveUrl = path + "\\" + strFileName + ".html";
    }
    catch (ThreadAbortException)
    {
        // Stop/close was requested: this is not a failure of the URL, so do not log it.
        throw;
    }
    catch (Exception)
    {
        ReportFailure(strUrl);
    }
}

/// <summary>Fetches <paramref name="url"/> and returns the response body as text.</summary>
private static string DownloadPage(string url)
{
    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);

    // NOTE(review): Encoding.Default is the machine's ANSI code page; this presumably
    // relies on the machine and the site both using a GB2312-compatible encoding - confirm.
    // The using blocks release the response and reader even when reading throws.
    using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
    using (StreamReader reader = new StreamReader(response.GetResponseStream(), Encoding.Default))
    {
        return reader.ReadToEnd();
    }
}

/// <summary>Turns the HTML entities used inside the textarea back into literal characters.</summary>
private static string DecodeEntities(string escaped)
{
    // "&amp;" is decoded last so that text such as "&amp;lt;" becomes "&lt;" rather than "<".
    return escaped
        .Replace("&quot;", "\"")
        .Replace("&lt;", "<")
        .Replace("&gt;", ">")
        .Replace("&amp;", "&");
}

/// <summary>Replaces characters that are not allowed in a file or folder name with '_'.</summary>
private static string MakeSafeFileName(string name)
{
    foreach (char invalid in Path.GetInvalidFileNameChars())
    {
        name = name.Replace(invalid, '_');
    }
    return name.Trim();
}

/// <summary>Appends the failed URL to c:\error.txt (one per line) and updates the status text.</summary>
private void ReportFailure(string url)
{
    using (StreamWriter sw = new StreamWriter(@"c:\error.txt", true, Encoding.GetEncoding("gb2312")))
    {
        sw.WriteLine(url);
    }

    this.strSaveUrl = "读取远程url失败, 未能保存";
}

        
/// <summary>
/// Appends <paramref name="str"/> to c:\网页特效代码\{strFloderName}\{strFileName}.html,
/// creating the folder when needed, and writes it as GB2312 text.
/// </summary>
/// <param name="str">Text to write.</param>
/// <param name="strFileName">File name without the ".html" extension.</param>
/// <param name="strFloderName">Sub-folder name below c:\网页特效代码\.</param>
/// <returns>The folder the file was written to (without the file name).</returns>
private static string SaveFile(string str, string strFileName, string strFloderName)
{
    string path = @"c:\" + @"网页特效代码\" + strFloderName;

    // CreateDirectory does nothing when the folder already exists.
    Directory.CreateDirectory(path);

    // The using block closes the file even if the write throws. The encoding is given
    // explicitly so the output does not depend on the writer's default encoding.
    using (StreamWriter sw = new StreamWriter(path + "\\" + strFileName + ".html", true, System.Text.Encoding.GetEncoding("gb2312")))
    {
        sw.Write(str);
    }

    return path;
}

        
/// <summary>
/// Timer callback: refreshes the clock, the elapsed-seconds counter, and the two
/// read-only text boxes that mirror the worker's current URL and last save location.
/// </summary>
/// <param name="sender">Timer that raised the event (unused).</param>
/// <param name="e">Event data (unused).</param>
private void timer1_Tick(object sender, EventArgs e)
{
    DateTime now = DateTime.Now;
    this.lblTime.Text = now.ToString();

    // TotalSeconds is the whole elapsed time; TimeSpan.Seconds is only the 0-59
    // seconds component and would wrap back to 0 every minute.
    TimeSpan span = now.Subtract(startTime);
    this.lblTimeElapsed.Text = ((long)span.TotalSeconds).ToString();

    this.txtUrl.Text = strCurUrl;
    this.txtSaveUrl.Text = strSaveUrl;
}

        
/// <summary>
/// Stops the download worker when the Stop button is clicked.
/// </summary>
/// <param name="sender">Control that raised the event (unused).</param>
/// <param name="e">Event data (unused).</param>
private void btnStop_Click(object sender, EventArgs e)
{
    // Stop can be clicked before Start has ever created the thread, or after the
    // worker has already ended; in both cases there is nothing to abort.
    if (getFileThread != null && getFileThread.IsAlive)
    {
        getFileThread.Abort();
    }
}

        
/// <summary>
/// Handler for the Pause button. Pausing is not implemented: the handler does nothing.
/// </summary>
/// <param name="sender">Control that raised the event (unused).</param>
/// <param name="e">Event data (unused).</param>
private void btnPause_Click(object sender, EventArgs e)
{
    // Intentionally empty; suspending the worker (getFileThread.Suspend()) was left disabled.
}
        
/// <summary>
/// Aborts the download worker, if one was ever created, once the form has closed.
/// </summary>
/// <param name="sender">Form that raised the event (unused).</param>
/// <param name="e">Event data (unused).</param>
private void form1_FormClosed(object sender, FormClosedEventArgs e)
{
    // Start was never clicked: no worker to shut down.
    if (getFileThread == null)
    {
        return;
    }

    getFileThread.Abort();
}
    }
}

form1.designer.cs
代码
namespace  dig
{
    
partial   class  form1
    {
        
///   <summary>
        
///  必需的设计器变量。
        
///   </summary>
         private  System.ComponentModel.IContainer components  =   null ;

        
/// <summary>
/// Cleans up any resources being used.
/// </summary>
/// <param name="disposing">true if managed resources should be released; otherwise false.</param>
protected override void Dispose(bool disposing)
{
    // The component container is a managed resource, so it is released only
    // on the disposing path and only when it exists.
    bool hasComponents = components != null;
    if (hasComponents && disposing)
    {
        components.Dispose();
    }

    base.Dispose(disposing);
}

        
#region  Windows 窗体设计器生成的代码

        
/// <summary>
/// Required method for designer support - do not modify
/// the contents of this method with the code editor.
/// </summary>
/// <remarks>
/// Creates the three buttons, two read-only text boxes, six labels and the one-second
/// timer, sets their layout properties and wires their event handlers.
/// </remarks>
private void InitializeComponent()
{
    this.components = new System.ComponentModel.Container();
    this.btnStart = new System.Windows.Forms.Button();
    this.txtUrl = new System.Windows.Forms.TextBox();
    this.label1 = new System.Windows.Forms.Label();
    this.label2 = new System.Windows.Forms.Label();
    this.txtSaveUrl = new System.Windows.Forms.TextBox();
    this.timer1 = new System.Windows.Forms.Timer(this.components);
    this.label3 = new System.Windows.Forms.Label();
    this.lblTime = new System.Windows.Forms.Label();
    this.label4 = new System.Windows.Forms.Label();
    this.lblTimeElapsed = new System.Windows.Forms.Label();
    this.btnStop = new System.Windows.Forms.Button();
    this.btnPause = new System.Windows.Forms.Button();
    this.SuspendLayout();
    // 
    // btnStart
    // 
    this.btnStart.Location = new System.Drawing.Point(97, 263);
    this.btnStart.Name = "btnStart";
    this.btnStart.Size = new System.Drawing.Size(75, 23);
    this.btnStart.TabIndex = 0;
    this.btnStart.Text = "&Start";
    this.btnStart.UseVisualStyleBackColor = true;
    this.btnStart.Click += new System.EventHandler(this.btnStart_Click);
    // 
    // txtUrl
    // 
    this.txtUrl.Location = new System.Drawing.Point(95, 20);
    this.txtUrl.Name = "txtUrl";
    this.txtUrl.ReadOnly = true;
    this.txtUrl.Size = new System.Drawing.Size(326, 21);
    this.txtUrl.TabIndex = 1;
    this.txtUrl.Text = "http://www.codefans.net/jscss/code/1866.shtml";
    // 
    // label1
    // 
    this.label1.AutoSize = true;
    this.label1.Location = new System.Drawing.Point(30, 23);
    this.label1.Name = "label1";
    this.label1.Size = new System.Drawing.Size(59, 12);
    this.label1.TabIndex = 2;
    this.label1.Text = "远程文件:";
    // 
    // label2
    // 
    this.label2.AutoSize = true;
    this.label2.Location = new System.Drawing.Point(30, 82);
    this.label2.Name = "label2";
    this.label2.Size = new System.Drawing.Size(59, 12);
    this.label2.TabIndex = 3;
    this.label2.Text = "保存位置:";
    // 
    // txtSaveUrl
    // 
    this.txtSaveUrl.Location = new System.Drawing.Point(97, 79);
    this.txtSaveUrl.Name = "txtSaveUrl";
    this.txtSaveUrl.ReadOnly = true;
    this.txtSaveUrl.Size = new System.Drawing.Size(326, 21);
    this.txtSaveUrl.TabIndex = 1;
    this.txtSaveUrl.Text = "http://www.codefans.net/jscss/code/1866.shtml";
    // 
    // timer1
    // 
    this.timer1.Enabled = true;
    this.timer1.Interval = 1000;
    this.timer1.Tick += new System.EventHandler(this.timer1_Tick);
    // 
    // label3
    // 
    this.label3.AutoSize = true;
    this.label3.Location = new System.Drawing.Point(30, 141);
    this.label3.Name = "label3";
    this.label3.Size = new System.Drawing.Size(59, 12);
    this.label3.TabIndex = 4;
    this.label3.Text = "当前时间:";
    // 
    // lblTime
    // 
    this.lblTime.AutoSize = true;
    this.lblTime.Location = new System.Drawing.Point(95, 141);
    this.lblTime.Name = "lblTime";
    this.lblTime.Size = new System.Drawing.Size(35, 12);
    this.lblTime.TabIndex = 5;
    this.lblTime.Text = "12:00";
    // 
    // label4
    // 
    this.label4.AutoSize = true;
    this.label4.Location = new System.Drawing.Point(30, 200);
    this.label4.Name = "label4";
    this.label4.Size = new System.Drawing.Size(59, 12);
    this.label4.TabIndex = 4;
    this.label4.Text = "共用时间:";
    // 
    // lblTimeElapsed
    // 
    this.lblTimeElapsed.AutoSize = true;
    this.lblTimeElapsed.Location = new System.Drawing.Point(95, 201);
    this.lblTimeElapsed.Name = "lblTimeElapsed";
    this.lblTimeElapsed.Size = new System.Drawing.Size(11, 12);
    this.lblTimeElapsed.TabIndex = 5;
    this.lblTimeElapsed.Text = "0";
    // 
    // btnStop
    // 
    this.btnStop.Location = new System.Drawing.Point(290, 263);
    this.btnStop.Name = "btnStop";
    this.btnStop.Size = new System.Drawing.Size(75, 23);
    this.btnStop.TabIndex = 0;
    this.btnStop.Text = "S&top";
    this.btnStop.UseVisualStyleBackColor = true;
    this.btnStop.Click += new System.EventHandler(this.btnStop_Click);
    // 
    // btnPause
    // 
    this.btnPause.Location = new System.Drawing.Point(194, 263);
    this.btnPause.Name = "btnPause";
    this.btnPause.Size = new System.Drawing.Size(75, 23);
    this.btnPause.TabIndex = 0;
    this.btnPause.Text = "&Pause";
    this.btnPause.UseVisualStyleBackColor = true;
    this.btnPause.Click += new System.EventHandler(this.btnPause_Click);
    // 
    // form1
    // 
    this.AutoScaleDimensions = new System.Drawing.SizeF(6F, 12F);
    this.AutoScaleMode = System.Windows.Forms.AutoScaleMode.Font;
    this.ClientSize = new System.Drawing.Size(442, 313);
    this.Controls.Add(this.lblTimeElapsed);
    this.Controls.Add(this.lblTime);
    this.Controls.Add(this.label4);
    this.Controls.Add(this.label3);
    this.Controls.Add(this.label2);
    this.Controls.Add(this.label1);
    this.Controls.Add(this.txtSaveUrl);
    this.Controls.Add(this.txtUrl);
    this.Controls.Add(this.btnStop);
    this.Controls.Add(this.btnPause);
    this.Controls.Add(this.btnStart);
    this.Name = "form1";
    this.Text = "提取网页";
    this.FormClosed += new System.Windows.Forms.FormClosedEventHandler(this.form1_FormClosed);
    this.ResumeLayout(false);
    this.PerformLayout();

}

        
#endregion

        
private  System.Windows.Forms.Button btnStart;
        
private  System.Windows.Forms.TextBox txtUrl;
        
private  System.Windows.Forms.Label label1;
        
private  System.Windows.Forms.Label label2;
        
private  System.Windows.Forms.TextBox txtSaveUrl;
        
private  System.Windows.Forms.Timer timer1;
        
private  System.Windows.Forms.Label label3;
        
private  System.Windows.Forms.Label lblTime;
        
private  System.Windows.Forms.Label label4;
        
private  System.Windows.Forms.Label lblTimeElapsed;
        
private  System.Windows.Forms.Button btnStop;
        
private  System.Windows.Forms.Button btnPause;
    }
}


 源码下载

你可能感兴趣的:(源码)