利用HttpWebRequest自动抓取

1、利用 HttpWatch 找到网站的登录入口及所需参数(详情略,请自行百度 HttpWatch 的使用方法)

2、利用 HttpWebRequest 将 51.la 的查看密码 POST 到网站入口,登录成功后再请求你想要的页面并抓取其内容

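<!--<br /><br />Code highlighting produced by Actipro CodeHighlighter (freeware)<br />http://www.CodeHighlighter.com/<br /><br />-->
        /// <summary>
        /// Fetches the 51.la search-engine report page (3_SE.asp) for one statistics ID
        /// over a single day or a date range, after logging in with the view password.
        /// </summary>
        /// <param name="strStaticId">Statistics ID; sent as the <c>id</c> field of the login form and of the report URL.</param>
        /// <param name="LookPass">View password; sent as the <c>lookpass</c> field of the login form.</param>
        /// <param name="strBeginDate">Start of the period; sent as the <c>d1</c> query parameter.</param>
        /// <param name="strEndDate">End of the period; sent as the <c>d2</c> query parameter.</param>
        /// <returns>The response body of the report page as text.</returns>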
         public   string  FiveLaSeo( string  strStaticId,  string  LookPass, string  strBeginDate, string  strEndDate)
        {
            CookieContainer Cc 
=   new  CookieContainer();

            ASCIIEncoding encoding 
=   new  ASCIIEncoding();
            
string  postData  =   " id= "   +  strStaticId;
            postData 
+=  ( " &lookpass= "   +  LookPass);
            postData 
+=   " &t=chalogin " ;

            
// 将提交的字符串数据转换成字节数组
             byte [] data  =  encoding.GetBytes(postData);

            
//  设置提交的相关参数   
            HttpWebRequest myRequest  =  (HttpWebRequest)WebRequest.Create( " http://www.51.la/report/0_help.asp " );
            myRequest.Method 
=   " POST " ;
            myRequest.ContentType 
=   " application/x-www-form-urlencoded " ;
            myRequest.ContentLength 
=  data.Length;
            
// cookie的容器一定要加
            myRequest.CookieContainer  =  Cc;

            
//  提交请求数据  
            Stream newStream  =  myRequest.GetRequestStream();            
            newStream.Write(data, 
0 , data.Length);
            newStream.Close();

            
//  接收返回的页面  
            HttpWebResponse myResponse  =  (HttpWebResponse)myRequest.GetResponse();
            StreamReader reader 
=   new  StreamReader(myResponse.GetResponseStream(), Encoding.Default);
            
string  content  =  reader.ReadToEnd();

            
// 进去后打开特定页面的参数设置
            myRequest  =  (HttpWebRequest)WebRequest.Create( " http://www.51.la/report/3_SE.asp?id= "   +  strStaticId  +   " &d1= "   +  strBeginDate  +   " &d2= "   +  strEndDate);
            myRequest.Method 
=   " GET " ;
            myRequest.KeepAlive 
=   false ;
            myRequest.CookieContainer 
=  Cc;

            
// 接收返回的特定页面
            myResponse  =  (HttpWebResponse)myRequest.GetResponse();
            newStream 
=  myResponse.GetResponseStream();
            reader 
=   new  StreamReader(myResponse.GetResponseStream(),Encoding.Default);
            content 
=  reader.ReadToEnd();
            
return  content;
        }

 

3、将抓取的页面进行正则表达式匹配,取出自己所需要的数据(这里我需要搜索引擎流量) 

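<!--<br /><br />Code highlighting produced by Actipro CodeHighlighter (freeware)<br />http://www.CodeHighlighter.com/<br /><br />-->
        /// <summary>
        /// Returns the search-engine visit count (IP) shown on the 51.la search-engine
        /// report page. The regular expression uses the lazy quantifier <c>*?</c>, which
        /// matches as few repetitions as possible.
        /// </summary>
        /// <param name="strStaticId">Statistics ID, passed through to <c>FiveLaSeo</c>.</param>
        /// <param name="LookPass">View password, passed through to <c>FiveLaSeo</c>.</param>
        /// <param name="strBeginDate">Start of the period, passed through to <c>FiveLaSeo</c>.</param>
        /// <param name="strEndDate">End of the period, passed through to <c>FiveLaSeo</c>.</param>
        /// <returns>The extracted count as a string, or "0" when nothing could be extracted.</returns>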
         public   string  FivelaSeoPv( string  strStaticId,  string  LookPass, string  strBeginDate, string  strEndDate)
        {
            Tool.FiveLa fl 
=   new  FiveLa();
            
string  html = fl.FiveLaSeo(strStaticId, LookPass, strBeginDate, strEndDate);
            
string  pattern  =   @" 来自搜索引擎的访问量 \( [\s\S]*? IP \) 占总访问量 " ;
            
// string pattern = @"占总访问量";
             string  number  =  Regex.Match(html, pattern, RegexOptions.IgnoreCase).Value;
            number 
=  number.Replace( " 来自搜索引擎的访问量 ( " "" ).Replace( "  IP ) " "" ).Replace( " 占总访问量 " , "" ).Trim(); ;
            
if  (number  ==   "" )
            { number 
=   " 0 " ; }
            
return  number;
        }

 

你可能感兴趣的:(html,搜索引擎,正则表达式,百度,asp)