C# 爬虫学习之猫眼电影

C# 爬虫学习之猫眼电影(完整代码见最后)

1、HTTP部分

1.1 引用

using System;
using System.Net;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using System.Configuration;
using System.Diagnostics;
using System.IO;

1.2 Get方法

/// <summary>
/// Minimal blocking HTTP helper used by the crawler.
/// </summary>
public class HTTP
{
	/// <summary>
	/// Sends a GET request and returns the whole response body decoded as UTF-8.
	/// </summary>
	/// <param name="url">Absolute URL to request.</param>
	/// <param name="cookies">Raw value for the Cookie header (for example "ci=1"); no Cookie header is sent when null.</param>
	/// <param name="timeout">Value assigned to <c>HttpWebRequest.Timeout</c>, in milliseconds.</param>
	/// <returns>The response body as a string.</returns>
	/// <exception cref="WebException">Propagated from <c>GetResponse</c> when the request fails or times out.</exception>
	public static string GET(string url, string cookies = null, int timeout = 5000)
	{
		HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
		request.Method = "GET";
		request.ContentType = "text/html;charset=UTF-8";
		request.UserAgent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36";
		request.Timeout = timeout;

		if (cookies != null)
		{
			request.Headers.Add("Cookie", cookies);
		}

		// The using blocks release the response, its stream and the reader even when
		// reading throws; the previous manual Close() calls only ran on the success
		// path and never closed the response object itself.
		using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
		using (Stream responseStream = response.GetResponseStream())
		using (StreamReader reader = new StreamReader(responseStream, Encoding.GetEncoding("utf-8")))
		{
			return reader.ReadToEnd();
		}
	}
}

2、数据抓取部分

2.1 步骤

  • 寻找如何提交所属城市信息
页面的 Cookie 中有一个 ci 属性,其值为城市 ID,例如 ci=1 表示北京。请求时通过 Cookie 提交 ci 属性,即可指定要查询的城市。
  • 发送Get请求
// Download the first listing page with the city cookie, then remove every
// newline, space, tab and carriage return so the page becomes one unbroken line.
string result = HTTP.GET("http://maoyan.com/cinemas?offset=0", this.ci)
	.Replace("\n", "")
	.Replace(" ", "")
	.Replace("\t", "")
	.Replace("\r", "");
  • 获取到页面HTML数据后,通过正则表达式取出所需的数据。C#使用正则表达式需要添加以下引用:
using System.Text.RegularExpressions;
  • 取城市名
// Resolve the city name only once; later calls keep the cached value.
// NOTE(review): as written the pattern is a lone lazy group, so it captures just
// the first non-whitespace character of the page. Presumably the HTML tags that
// surrounded the group were lost when the article was published - confirm
// against the author's original source.
if (this.cityName == null)
{
	Match firstHit = Regex.Match(this.html, "([\\S]+?)");
	this.cityName = firstHit.Groups[1].Value;
}
  • 取出影院名和地址
// Extract the cinema entries: group [3] is read as the name, group [5] as the address.
// NOTE(review): the regex literal below looks damaged - it spans several lines
// and contains an unbalanced ')', which suggests the HTML tags inside the pattern
// were stripped when the article was published. Restore the pattern from the
// author's original source before using this snippet.
MatchCollection s = Regex.Matches(this.html,"([\\S]+?)).*?(地址:([\\S]+?))

.*?
"); for (int i = 0; i < s.Count; i++) { this.cinemaCount++; Console.WriteLine (String.Format ("------第{0}家影院------", this.cinemaCount)); Console.WriteLine (String.Format ("名字:{0}", s[i].Groups[3].Value)); Console.WriteLine (String.Format ("地址:{0}", s[i].Groups[5].Value)); }
  • 取页面数
// Read the pager links, then download and print every page after the first one.
MatchCollection pagerLinks = Regex.Matches(this.html, "offset=\\d+\">(\\d+)");

if (pagerLinks.Count > 1)
{
	// The page count is parsed from the last capture group of the last pager link.
	// NOTE(review): assumes the last numeric pager link is the final page - confirm.
	Match lastLink = pagerLinks[pagerLinks.Count - 1];
	GroupCollection lastGroups = lastLink.Groups;
	this.pageCount = Int32.Parse(lastGroups[lastGroups.Count - 1].Value);

	// The loop starts at 1 because offset=0 is the page already held in this.html;
	// the offset advances by 12 for each page.
	for (int page = 1; page < this.pageCount; page++)
	{
		string pageUrl = String.Format("http://maoyan.com/cinemas?offset={0}", page * 12);

		this.html = HTTP.GET(pageUrl, this.ci)
			.Replace("\n", "")
			.Replace(" ", "")
			.Replace("\t", "")
			.Replace("\r", "");

		this.getCinemaData();
	}
}

2.2 结语

至此就可以把猫眼电影单个城市所有电影院名和地址全部取出了

完整代码

using System;
using System.Text.RegularExpressions;

namespace maoyan
{
	public class Cinema
	{	
		// Set to true at the end of Ready and back to false at the end of Start;
		// while true, further Ready calls return immediately.
		private bool isReady = false;
		// True while Start is running; a Start call made meanwhile returns immediately.
		private bool isStart = false;
		// City name extracted from the page; null until getCityName has filled it in.
		public string cityName = null;
		// Whitespace-stripped HTML of the page currently being parsed.
		private string html;
		// Running number of cinemas printed so far.
		public int cinemaCount = 0;
		// Page count parsed from the pager by getPageCount.
		public int pageCount = 0;
		// Cookie string passed to HTTP.GET with each request (Main passes "ci=1").
		public string ci;

		/// <summary>
		/// Resets the crawler state and downloads the first cinema-list page for a city.
		/// Returns immediately when an earlier Ready call has not been consumed by Start yet.
		/// </summary>
		/// <param name="ci">Cookie string that selects the city, for example "ci=1".</param>
		public void Ready (string ci)
		{
			if (this.isReady)
			{
				return;
			}

			this.html = "";
			this.cinemaCount = 0;
			this.cityName = null;
			this.pageCount = 0;

			// Store the cookie BEFORE issuing the request. Previously this.ci was
			// assigned only after HTTP.GET had already been called with it, so the
			// first request went out with a null (or stale) cookie and the requested
			// city was not applied.
			this.ci = ci;

			// Start from the first page of the selected city.
			string result = HTTP.GET("http://maoyan.com/cinemas?offset=0", this.ci);
			// Strip all whitespace so the page becomes one unbroken line for the regexes.
			result = result.Replace("\n", "").Replace(" ","").Replace("\t","").Replace("\r","");

			this.html = result;
			this.isReady = true;
		}

		/// <summary>
		/// Parses the page loaded by Ready: prints the city name, then every cinema on
		/// that page. Does nothing when no page is loaded or a run is already in progress.
		/// </summary>
		public void Start ()
		{
			// html stays null until Ready has run, so test for null as well as empty;
			// the previous this.html.Length check threw NullReferenceException when
			// Start was called before Ready.
			if (string.IsNullOrEmpty (this.html) || this.isStart)
			{
				return;
			}
			this.isStart = true;

			// Extract and print the city name.
			this.getCityName ();
			Console.WriteLine ("当前城市:{0}",this.cityName);
			// Print the cinemas found on the loaded page.
			this.getCinemaData ();
			// Pagination is currently disabled: getPageCount would fetch the
			// remaining pages and print their cinemas as well.
			//this.getPageCount ();
			// Clear both flags so Ready/Start can be used again.
			this.isReady = false;
			this.isStart = false;
		}

		/// <summary>
		/// Fills cityName from the loaded page unless it has already been set.
		/// </summary>
		public void getCityName ()
		{
			// Already resolved: keep the cached value.
			if (this.cityName != null)
			{
				return;
			}

			// NOTE(review): as written the pattern is a lone lazy group, so it captures
			// just the first non-whitespace character of the page. Presumably the HTML
			// tags around the group were lost when the article was published - confirm
			// against the author's original source.
			Match firstHit = Regex.Match (this.html, "([\\S]+?)");
			this.cityName = firstHit.Groups [1].Value;
		}

		/// <summary>
		/// Prints the name and address of every cinema matched in the current page and
		/// increments cinemaCount once per match.
		/// </summary>
		// NOTE(review): this part of the listing looks damaged by publishing. The regex
		// literal spans several lines and has an unbalanced ')', which suggests the HTML
		// tags inside the pattern were stripped. The rest of the class - the loop below,
		// getPageCount (reads the pager, then fetches each following page and calls
		// getCinemaData on it) and the closing braces - is collapsed onto a single line,
		// where the inline "//" comment would swallow all the code after it.
		// Restore this section from the author's original source before compiling.
		public void getCinemaData ()
		{
			// Extract the cinema entries: group [3] is read as the name, group [5] as the address.
			MatchCollection s = Regex.Matches(this.html,"([\\S]+?)).*?(地址:([\\S]+?))

.*?
"); for (int i = 0; i < s.Count; i++) { this.cinemaCount++; Console.WriteLine (String.Format ("------第{0}家影院------", this.cinemaCount)); Console.WriteLine (String.Format ("名字:{0}", s[i].Groups[3].Value)); Console.WriteLine (String.Format ("地址:{0}", s[i].Groups[5].Value)); } } public void getPageCount () { // 取页数 MatchCollection pageNumber = Regex.Matches(this.html,"offset=\\d+\">(\\d+)"); if (pageNumber.Count > 1) { this.pageCount = Int32.Parse (pageNumber [pageNumber.Count - 1].Groups [pageNumber [pageNumber.Count - 1].Groups.Count - 1].Value); for (int m = 1; m <= this.pageCount - 1; m++) { string pageUrl = String.Format ("http://maoyan.com/cinemas?offset={0}", m * 12); string pageData = HTTP.GET (pageUrl, this.ci); pageData = pageData.Replace ("\n", "").Replace (" ", "").Replace ("\t", "").Replace ("\r", ""); this.html = pageData; this.getCinemaData (); } } } } }

Main函数

using System;

namespace maoyan
{
	/// <summary>
	/// Console entry point: runs one crawl and prints start/end banners around it.
	/// </summary>
	class MainClass
	{
		public static void Main (string[] args)
		{
			Console.WriteLine ("------------开始------------");

			// "ci=1" is the city cookie; the accompanying article states that 1 is Beijing.
			Cinema crawler = new Cinema ();
			crawler.Ready ("ci=1");
			crawler.Start ();

			Console.WriteLine ("------------结束------------");
		}
	}
}

你可能感兴趣的:(C#)