Writing Your Own Web Crawler: Scraping Categorized Pages and Importing Them into a Database

I have long wanted to write up the concrete implementation of my page-scraping code, both so that others can point out mistakes and so that I can keep learning and improving it. This is by no means a general-purpose crawler: what I wrote is limited and can only collect data from pages with one specific structure. If you know a better approach, please do not hesitate to share it. Thanks in advance!

1. Fetching the page content:
There are plenty of page-fetching snippets to be found online; the method below is one I found and is included here for reference:

/// <summary>
/// Download the full HTML source of a web page
/// </summary>
/// <param name="url">Address of the site to fetch</param>
/// <param name="charSets">Encoding of the target page; pass null or "" to detect the encoding automatically</param>
/// <returns>The page source as a string</returns>
public static string getHtml(string url, params string[] charSets)
{
    try
    {
        string charSet = null;
        if (charSets.Length == 1)
        {
            charSet = charSets[0];
        }
        WebClient myWebClient = new WebClient(); // create the WebClient instance
        // Note: some pages cannot be downloaded this way for various reasons,
        // e.g. they require a cookie or have encoding quirks. Handle those
        // case by case, for example by adding a cookie header:
        // myWebClient.Headers.Add("Cookie", cookie);
        // Extra overloads can be added as needed.
        // Network credentials used to authenticate the request to the Internet resource.
        myWebClient.Credentials = CredentialCache.DefaultCredentials;
        // If the server requires a user name and password:
        // NetworkCredential mycred = new NetworkCredential(struser, strpassword);
        // myWebClient.Credentials = mycred;
        // Download the data from the resource and return it as a byte array.
        byte[] myDataBuffer = myWebClient.DownloadData(url);
        string strWebData = Encoding.Default.GetString(myDataBuffer);
        // Read the charset declared in the page's <meta> tag (a simple pattern that
        // assumes a declaration of the form charset=xxx").
        Match charSetMatch = Regex.Match(strWebData, "<meta([^<]*)charset=([^<]*)\"", RegexOptions.IgnoreCase | RegexOptions.Multiline);
        string webCharSet = charSetMatch.Groups[2].Value;
        if (charSet == null || charSet == "")
            charSet = webCharSet;
        if (charSet != null && charSet != "" && Encoding.GetEncoding(charSet) != Encoding.Default)
        {
            strWebData = Encoding.GetEncoding(charSet).GetString(myDataBuffer);
        }
        else
        {
            strWebData = Encoding.GetEncoding("utf-8").GetString(myDataBuffer);
        }
        return strWebData;
    }
    catch { return ""; } // swallow errors and return an empty string
}
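
For example, calling the helper looks like this; the URL and the "utf-8" argument are the same ones used by the button handler in section 3, and note that the method swallows exceptions and simply returns an empty string on failure:

// Minimal usage sketch of getHtml.
string html = getHtml("http://www.tansoole.com/", "utf-8");
if (html == "")
{
    // download failed: bad URL, network error, unsupported charset, ...
}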

2. Processing the page source and importing the categories into the database
The page source is processed with regular expressions: matching HTML tags, and matching whitespace and blank lines. A stripped-down sketch of the match-and-remove loop follows the method below.

public string GetClasses(string str)
{
    Model.product_category model = new Model.product_category();
    BLL.product_category bll = new BLL.product_category();
    string classname = "";

    // NOTE: the tag names in these patterns (h3/dt/a/li) and in the IndexOf
    // calls further down are placeholders; adjust them to the actual markup
    // of the page being scraped.
    string pattern1 = "<h3>(.*?)</h3>";          // level-1 category heading
    string pattern2 = "(<dt[^>]*>)(.*?)</dt>";   // level-2 category block
    string pattern21 = "<a[^>]*>(.*?)</a>";      // link text inside a block
    string pattern3 = "(<li[^>]*>)(.*?)</li>";   // level-3 category item

    // the char loops below are only used as an upper bound on iterations
    foreach (char b in str)
    {
        var m = Regex.Match(str, pattern1, RegexOptions.Singleline | RegexOptions.IgnoreCase);
        string bs1 = m.Groups[1].Value;                  // level-1 category title
        str = str.Substring(str.IndexOf("</h3>") + 17);  // skip past the heading (offset is page-specific)
        classname = "★Level 1: " + bs1;

        model = new Model.product_category();
        model.title = bs1; model.call_index = ""; model.parent_id = 0; model.class_layer = 1; model.sort_id = 99;
        int cateId = bll.Add(model);
        if (cateId > 0)
        {
            model.class_list = "," + cateId + ",";
            bll.Update(model);

            foreach (char a in str)
            {
                var m2 = Regex.Match(str, pattern2, RegexOptions.Singleline | RegexOptions.IgnoreCase);
                if (m2.Length > 0)
                {
                    string aa = m2.ToString();
                    m = Regex.Match(aa, pattern21, RegexOptions.Singleline | RegexOptions.IgnoreCase);
                    string bs2 = m.Groups[1].Value;      // level-2 category title
                    // cut the handled block out of str so the next Match finds the next block
                    Regex rgx2 = new Regex("(<dt[^>]*>)(.*?)" + bs2 + "</dt>");
                    str = rgx2.Replace(str, "");
                    classname += "◆Level 2: " + bs2 + "▲Level 3: [";

                    model = new Model.product_category();
                    model.title = bs2; model.call_index = ""; model.parent_id = cateId; model.class_layer = 2; model.sort_id = 99;
                    int catexId = bll.Add(model);
                    if (catexId > 0)
                    {
                        model.class_list = "," + cateId + "," + catexId + ",";
                        bll.Update(model);

                        foreach (char c in str)
                        {
                            var m3 = Regex.Match(str, pattern3, RegexOptions.Singleline | RegexOptions.IgnoreCase);
                            if (m3.Length > 0)
                            {
                                string str3 = m3.ToString();
                                m = Regex.Match(str3, pattern21, RegexOptions.Singleline | RegexOptions.IgnoreCase);
                                string bs3 = m.Groups[1].Value;   // level-3 category title

                                model = new Model.product_category();
                                model.title = bs3; model.call_index = ""; model.parent_id = catexId;
                                model.class_list = ""; model.class_layer = 3; model.sort_id = 99;
                                int catexxId = bll.Add(model);
                                if (catexxId > 0)
                                {
                                    model.class_list = "," + cateId + "," + catexId + "," + catexxId + ",";
                                    bll.Update(model);
                                }
                                classname += "|" + bs3;

                                Regex rgx = new Regex("(<li[^>]*>)(.*?)" + bs3 + "</li>");
                                str = rgx.Replace(str, "");
                            }
                            // clean-up pass: removes tabs (and stray '(' ')' '*' chars) along with any line breaks just before them
                            str = new Regex("(\r\n)*[(\\t)*]").Replace(str, "");
                            int index_0 = str.IndexOf("</ul>");  // end of the level-3 list (placeholder tag)
                            if (index_0 == 74 || index_0 == 29)  // offsets are specific to the target page
                            {
                                str = str.Substring(index_0 + 34);
                                break;  // innermost loop done: all level-3 items of this level-2 block handled
                            }
                        }
                    }
                    classname += "]";
                }
            }
        }
        int index_1 = str.IndexOf("</div>");  // end of the category sidebar (placeholder tag)
        if (index_1 < 45)
        {
            break;  // outer loop done: all level-1 categories handled
        }
    }
    return classname;
}

3. Triggering the crawl
Front-end markup:

Enter URL: <asp:TextBox ID="TextBox1" runat="server" Width="400">http://www.tansoole.com/</asp:TextBox>
<asp:Button ID="Button2" runat="server" Text="Submit" onclick="Button2_Click" /><br /><br />
<asp:TextBox ID="TextBox2" runat="server" TextMode="MultiLine" Rows="30" Columns="100"></asp:TextBox>

Code-behind event handler:

protected void Button2_Click(object sender, EventArgs e)
{
    if (this.TextBox1.Text.Length > 0)
    {
        // download the page, then jump straight to the category sidebar;
        // the "sidebar" marker and +10 offset are specific to the target page
        string str = getHtml(this.TextBox1.Text, "utf-8");
        str = str.Substring(str.IndexOf("sidebar") + 10);
        this.TextBox2.Text = GetClasses(str);
    }
}

This post only offers one way of thinking about page scraping; the concrete implementation still has to be checked and adapted to the pages you actually target.

PS: regex bits used above
\s matches a whitespace character (note the escaping backslash)
(\r\n)* matches line breaks and blank lines
(\t)* matches tab characters
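
For example, the clean-up step can be tried on its own like this (a small sketch; the sample string is invented, and it assumes the same System.Text.RegularExpressions namespace used above):

// Remove line breaks and tabs, mirroring the clean-up call inside GetClasses.
string raw = "<li>\r\n\t<a href='/c/3'>Glassware</a>\r\n</li>";
string cleaned = Regex.Replace(raw, "(\\r\\n)*(\\t)*", "");
// cleaned == "<li><a href='/c/3'>Glassware</a></li>"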
