c# 连续抓取页面内容

实现功能:先去一个url抓取页面,再在页面的内容里面去找另一个url。找到这个url之后,经过一系列操作,再用重组后的url去抓取内容。

第一、写出c#抓取页面的代码

c#抓取页面
  1 using System;

  2 using System.Collections.Generic;

  3 using System.Linq;

  4 using System.Web;

  5 using System.IO;

  6 using System.Net;

  7 using System.Text;

  8 

  9 /// <summary>

 10 ///abc 的摘要说明

 11 /// </summary>

 12 public static class abc

 13 {

 14     /// <summary>

 15     /// webRequest 模拟http get请求

 16     /// </summary>

 17     /// <param name="strUrl">请求的url</param>

 18     /// <param name="encoding">编码</param>

 19     /// <returns>返回字符串</returns>

 20     public static string GetHttpResponse(this string strUrl, Encoding encoding)

 21     {

 22         string strResult = string.Empty;

 23         try

 24         {

 25             HttpWebRequest myReq = (HttpWebRequest)HttpWebRequest.Create(strUrl);

 26             HttpWebResponse HttpWResp = (HttpWebResponse)myReq.GetResponse();

 27             Stream myStream = HttpWResp.GetResponseStream();

 28             StreamReader sr = new StreamReader(myStream, encoding);

 29             strResult = sr.ReadToEnd();

 30 

 31         }

 32         catch (Exception ex)

 33         {

 34             WriteLog(ex.Message, strUrl);

 35         }

 36 

 37         return strResult;

 38     }

 39 

 40     /// <summary>

 41     /// webRequest 模拟http post请求

 42     /// </summary>

 43     /// <param name="url">请求的url</param>

 44     /// <param name="val">post 的数据</param>

 45     /// <returns>返回字符串</returns>

 46     public static string GetHttpPostResponse(this string url, string val, Encoding encoding)

 47     {

 48         string strResult = string.Empty;

 49         try

 50         {

 51             HttpWebRequest myReq = (HttpWebRequest)HttpWebRequest.Create(url);

 52             myReq.Method = "Post";

 53             myReq.ContentType = "application/x-www-form-urlencoded";

 54             byte[] byteArray = encoding.GetBytes(val);

 55             myReq.ContentLength = byteArray.Length;

 56             Stream stream = myReq.GetRequestStream();

 57             stream.Write(byteArray, 0, byteArray.Length);

 58             stream.Close();

 59             HttpWebResponse HttpWResp = (HttpWebResponse)myReq.GetResponse();

 60             Stream myStream = HttpWResp.GetResponseStream();

 61             StreamReader sr = new StreamReader(myStream, encoding);

 62             strResult = sr.ReadToEnd();

 63 

 64         }

 65         catch (Exception ex)

 66         {

 67             WriteLog(ex.Message + val, url);

 68         }

 69 

 70         return strResult;

 71     }

 72 

 73     public static void WriteLog(string sLog, string titleLog)

 74     {

 75         try

 76         {

 77             string logPath = System.AppDomain.CurrentDomain.BaseDirectory;//目录位置

 78 

 79             DateTime dt = DateTime.Now;

 80             string logfile = new StringBuilder(logPath).Append("\\Log\\").Append(dt.ToString("yyyy-MM-dd")).Append("\\").Append(titleLog).Append("_").Append(dt.ToString("yyyyMMddHHmmss")).Append(".txt").ToString();

 81             if (!System.IO.Directory.Exists(System.IO.Path.GetDirectoryName(logfile)))

 82             {

 83                 System.IO.Directory.CreateDirectory(System.IO.Path.GetDirectoryName(logfile));

 84             }

 85             if (!File.Exists(logfile))

 86             {

 87                 FileStream fs = System.IO.File.Create(logfile);

 88                 fs.Close();

 89             }

 90             using (StreamWriter sw = new StreamWriter(logfile, true))

 91             {

 92                 sw.WriteLine(DateTime.Now.ToString("yyyy-MM-dd HH:mm:ss") + ":");

 93                 sw.WriteLine(sLog);

 94                 sw.WriteLine();

 95                 sw.Close();

 96             }

 97         }

 98         catch

 99         {

100 

101         }

102     }

103 }


第二、调用里面的方法GetHttpResponse去抓取页面(注这是get方式,如果是post方式可以选择post方式)

第三、用正则匹配方式得到想要的URL(Match mc = Regex.Match(aa, "action=(.*)>", RegexOptions.IgnoreCase);)

第四、由于此时得到的URL是经过浏览器处理的URL,如果我们直接拿它去抓取页面就会找不到页面,因为此时的URL的协议是Https协议。所以我们需要中间做一次跳转。

首先还是去抓 aa = abc.GetHttpResponse(str, Encoding.UTF8);得到的内容里有一个将要跳转到的url目录。我们需要将主域名与刚刚得到的这个目录拼接成完整的URL。

第五、然后再去抓取。就可以得到我们想要的内容!

你可能感兴趣的:(C#)