去年早些时候的在博客中分享过一篇文章 通过Socket进行HttP/HTTPS网页操作 ,之后在另外一文章 Socket模拟HTTP协议之火车票购票软件 中用到了该帮助类.
时隔一年之后,在偶尔的使用过程中我将该类重新修复修改了一下:
1、增加了对chunked协议的解析
2、修改了一些API接口
3、修复了一些BUG
就在最近也还有同学找到我请教该类的一些用法,所以再次整理了一下发布出来,希望对需要的朋友有所帮助。
不过,目前该类的一些封装和API接口还不是特别友好,有需要的朋友可自行理解后修改,我也很善于帮不懂的朋友解答一些问题。
如若以后再有新的修改我也会同样共享出来。
需要Socket方式提交HTTP的同学请无视HttpWebRequest方式的部分;
需要返回编码的同学我想说的是,许多在做采集的同学都希望能智能的识别返回内容的编码,我个人觉着没多大必要,
因为大多时候大家要采集的目标是确定的,那其内容编码也是确定的,通过一些硬编码或者配置文件可解决的问题没必要陷入一个误区,一定做智能编码识别反倒吃力不讨好。
using System; using System.Collections; using System.Collections.Generic; using System.Diagnostics; using System.IO; using System.Net; using System.Net.Security; using System.Net.Sockets; using System.Security.Authentication; using System.Security.Cryptography.X509Certificates; using System.Text; using System.Text.RegularExpressions; using System.Threading; using System.Threading.Tasks; /************************************************************************/ /* Author:huliang * Email:[email protected] * 注意:转载请注明出处 /************************************************************************/ namespace LiangHu { /// <summary> /// HTTP协议头包装 /// </summary> public class HttpHeader { public HttpHeader() : this("") { } public HttpHeader(string url) { this.Url = url; } public string Url { get; set; } public string Host { get; set; } public string Accept { get; set; } public string Referer { get; set; } public string Cookies { get; set; } public string Body { get; set; } Dictionary<string, string> m_Others = new Dictionary<string, string>(); public string this[string key] { get { return m_Others.ContainsKey(key) ? m_Others.ContainsKey(key) : null; } set { Add(key, value); } } public void Add(string key, string value) { switch (key.ToUpper()) { case "URL": this.Url = value; break; case "HOST": this.Host = value; break; case "ACCEPT": this.Accept = value; break; case "REFERER": this.Referer = value; break; case "BODY": this.Body = value; break; default: if (!m_Others.ContainsKey(key)) { m_Others.Add(key, value); } else { m_Others[key] = value; } break; } } } /// <summary> /// HTTP回应包装 /// </summary> public class HttpResponse { internal HttpResponse(string header, byte[] body) { this.Header = header; this.Body = body; } //暂未将回应HTTP协议头转换为HttpHeader类型 public string Header { get; private set; } public byte[] Body { get; private set; } } /// <summary> /// HttpHelper /// </summary> public static class HttpHelper { /// <summary> /// 提交方法 /// </summary> enum HttpMethod { GET, POST } #region HttpWebRequest & HttpWebResponse /// <summary> /// Get方法 /// </summary> /// <param name="url">请求地址</param> /// <param name="cookieContainer">Cookies存储器</param> /// <param name="encoding">返回内容的编码格式</param> /// <param name="others">其他需要补充的HTTP协议头</param> /// <returns>请求结果</returns> public static string Get(string url, CookieContainer cookieContainer, Encoding encoding, Dictionary<string, string> others = null) { return InternalHttp(HttpMethod.GET, url, null, cookieContainer, encoding, others); } /// <summary> /// Post方法 /// </summary> /// <param name="url">请求地址</param> /// <param name="body">Post内容</param> /// <param name="cookieContainer">Cookies存储器</param> /// <param name="encoding">返回内容编码</param> /// <param name="others">其他需要补充的HTTP协议头</param> /// <returns>请求结果</returns> public static string Post(string url, byte[] body, CookieContainer cookieContainer, Encoding encoding, Dictionary<string, string> others = null) { return InternalHttp(HttpMethod.POST, url, body, cookieContainer, encoding, others); } /// <summary> /// Http操作 /// </summary> /// <param name="method">请求方式</param> /// <param name="url">请求地址</param> /// <param name="bytes">提交的数据</param> /// <param name="cookieContainer">Cookies存储器</param> /// <param name="encoding">返回内容编码</param> /// <param name="others">其他需要补充的HTTP协议头</param> /// <returns>请求结果</returns> static string InternalHttp(HttpMethod method, string url, byte[] bytes, CookieContainer cookieContainer, Encoding encoding, Dictionary<string, string> others = null) { string result = string.Empty; if (string.IsNullOrEmpty(url)) throw new ArgumentNullException("请求地址不能为空"); if (method == HttpMethod.POST) { if (bytes == null) throw new ArgumentNullException("提交的数据不能为空"); } if (cookieContainer == null) throw new ArgumentNullException("Cookies存储器不能为空"); try { HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url); request.Method = method.ToString(); request.UserAgent = "Mozilla/5.0 (Windows NT 6.1;MSIE 9.0;)"; request.CookieContainer = cookieContainer; if (method == HttpMethod.POST) { request.ContentType = "application/x-www-form-urlencoded"; request.ContentLength = bytes.Length; using (Stream stream = request.GetRequestStream()) { stream.Write(bytes, 0, bytes.Length); stream.Flush(); } } if (others != null) { foreach (KeyValuePair<string, string> pair in others) { request.Headers.Add(pair.Key, pair.Value); } } HttpWebResponse response = (HttpWebResponse)request.GetResponse(); if (response.StatusCode == HttpStatusCode.OK) { using (StreamReader reader = new StreamReader(response.GetResponseStream(), encoding)) { result = reader.ReadToEnd(); } } } catch (Exception ex) { Debug.WriteLine(ex.Message); } return result; } #endregion #region Socket static bool ValidateServerCertificate( object sender, X509Certificate certificate, X509Chain chain, SslPolicyErrors sslPolicyErrors) { /* if (sslPolicyErrors == SslPolicyErrors.None) return true; Console.WriteLine("Certificate error: {0}", sslPolicyErrors); return false; */ return true; } public static HttpResponse Get(IPEndPoint endpoint, HttpHeader header) { return Get(endpoint, header, null); } public static HttpResponse Get(IPEndPoint endpoint, HttpHeader header, X509CertificateCollection x509certs) { return InternalSslSocketHttp(HttpMethod.GET, endpoint, header, x509certs); } public static HttpResponse Post(IPEndPoint endpoint, HttpHeader header) { return Post(endpoint, header, null); } public static HttpResponse Post(IPEndPoint endpoint, HttpHeader header, X509CertificateCollection x509certs) { return InternalSslSocketHttp(HttpMethod.POST, endpoint, header, x509certs); } static HttpResponse InternalSslSocketHttp(HttpMethod method, IPEndPoint endpoint, HttpHeader header, X509CertificateCollection x509certs) { HttpResponse response = null; try { TcpClient tcp = new TcpClient(); tcp.Connect(endpoint); if (tcp.Connected) { byte[] buff = ParseHttpHeaderToBytes(method, header); //生成协议包 if (x509certs != null) { using (SslStream ssl = new SslStream(tcp.GetStream(), false, new RemoteCertificateValidationCallback(ValidateServerCertificate), null)) { ssl.AuthenticateAsClient("SslServerName", x509certs, SslProtocols.Tls, false); if (ssl.IsAuthenticated) { ssl.Write(buff); ssl.Flush(); response = ReadResponse(ssl); } } } else { using (NetworkStream ns = tcp.GetStream()) { ns.Write(buff, 0, buff.Length); ns.Flush(); response = ReadResponse(ns); } } } } catch (Exception ex) { Debug.WriteLine(ex.Message); } return response; } class TaskArguments { public TaskArguments(CancellationTokenSource cancelSource, Stream sm) { this.CancelSource = cancelSource; this.Stream = sm; } public CancellationTokenSource CancelSource { get; private set; } public Stream Stream { get; private set; } } private static HttpResponse ReadResponse(Stream sm) { HttpResponse response = null; CancellationTokenSource cancelSource = new CancellationTokenSource(); Task<string> myTask = Task.Factory.StartNew<string>( new Func<object, string>(ReadHeaderProcess), new TaskArguments(cancelSource,sm), cancelSource.Token); if (myTask.Wait(3 * 1000)) //尝试3秒时间读取协议头 { string header = myTask.Result; if (!string.IsNullOrEmpty(header)) { if (header.StartsWith("HTTP/1.1 100")) { return ReadResponse(sm); } byte[] buff = null; int start = header.ToUpper().IndexOf("CONTENT-LENGTH"); int content_length = -1; //fix bug if (start > 0) { string temp = header.Substring(start, header.Length - start); string[] sArry = Regex.Split(temp, "\r\n"); content_length = Convert.ToInt32(sArry[0].Split(':')[1]); if (content_length > 0) { buff = new byte[content_length]; int inread = sm.Read(buff, 0, buff.Length); while (inread < buff.Length) { inread += sm.Read(buff, inread, buff.Length - inread); } } } else { start = header.ToUpper().IndexOf("TRANSFER-ENCODING: CHUNKED"); if (start > 0) { buff = ChunkedReadResponse(sm); } else { buff = SpecialReadResponse(sm);//例外 } } response = new HttpResponse(header, buff); } } else { cancelSource.Cancel(); //超时的话,别忘记取消任务哦 } return response; } static string ReadHeaderProcess(object args) { TaskArguments argument = args as TaskArguments; StringBuilder bulider = new StringBuilder(); if (argument != null) { Stream sm = argument.Stream; while (!argument.CancelSource.IsCancellationRequested) { try { int read = sm.ReadByte(); if (read != -1) { byte b = (byte)read; bulider.Append((char)b); string temp = bulider.ToString(); if (temp.EndsWith("\r\n\r\n"))//Http协议头尾 { break; } } else { break; } } catch (Exception ex) { Debug.WriteLine(ex.Message); break; } } } return bulider.ToString(); } class ArraySegmentList<T> { List<ArraySegment<T>> m_SegmentList = new List<ArraySegment<T>>(); public ArraySegmentList() { } int m_Count = 0; public void Add(ArraySegment<T> arraySegment) { m_Count += arraySegment.Count; m_SegmentList.Add(arraySegment); } public T[] ToArray() { T[] array = new T[m_Count]; int index = 0; for (int i = 0; i < m_SegmentList.Count; i++) { ArraySegment<T> arraySegment = m_SegmentList[i]; Array.Copy(arraySegment.Array, 0, array, index, arraySegment.Count); index += arraySegment.Count; } return array; } } static byte[] ChunkedReadResponse(Stream sm) { ArraySegmentList<byte> arraySegmentList = new ArraySegmentList<byte>(); int chunked = GetChunked(sm); while (chunked > 0) { byte[] buff = new byte[chunked]; try{ int inread = sm.Read(buff, 0, buff.Length); while (inread < buff.Length) { inread += sm.Read(buff, inread, buff.Length - inread); } arraySegmentList.Add(new ArraySegment<byte>(buff)); if (sm.ReadByte() != -1)//读取段末尾的\r\n { sm.ReadByte(); } }catch(Exception){ break; } chunked = GetChunked(sm); } return arraySegmentList.ToArray(); } static int GetChunked(Stream sm) { int chunked = 0; StringBuilder bulider = new StringBuilder(); while (true) { try { int read = sm.ReadByte(); if (read != -1) { byte b = (byte)read; bulider.Append((char)b); string temp = bulider.ToString(); if (temp.EndsWith("\r\n")) { chunked = Convert.ToInt32(temp.Trim(), 16); break; } } else { break; } } catch (Exception ex) { Debug.WriteLine(ex.Message); break; } } return chunked; } /* * 注意:该方法仅供测试,实际使用时请根据需要定制 */ static byte[] SpecialReadResponse(Stream sm) { ArrayList array = new ArrayList(); StringBuilder bulider = new StringBuilder(); int length = 0; DateTime now = DateTime.Now; while (true) { byte[] buff = new byte[1024 * 10]; int len = sm.Read(buff, 0, buff.Length); if (len > 0) { length += len; byte[] reads = new byte[len]; Array.Copy(buff, 0, reads, 0, len); array.Add(reads); bulider.Append(Encoding.Default.GetString(reads)); } string temp = bulider.ToString(); if (temp.ToUpper().Contains("</HTML>")) { break; } if (DateTime.Now.Subtract(now).TotalSeconds >= 30) { break;//超时30秒则跳出 } } byte[] bytes = new byte[length]; int index = 0; for (int i = 0; i < array.Count; i++) { byte[] temp = (byte[])array[i]; Array.Copy(temp, 0, bytes, index, temp.Length); index += temp.Length; } return bytes; } #endregion #region Helper /// <summary> /// 将HTTP协议头转换为Bytes数据 /// </summary> /// <param name="method">HTTP方法</param> /// <param name="header">HTTP协议头</param> /// <returns>Bytes数据</returns> static byte[] ParseHttpHeaderToBytes(HttpMethod method, HttpHeader header) { StringBuilder bulider = new StringBuilder(); if (method.Equals(HttpMethod.POST)) { bulider.AppendLine(string.Format("POST {0} HTTP/1.1", header.Url)); bulider.AppendLine("Content-Type: application/x-www-form-urlencoded"); } else { bulider.AppendLine(string.Format("GET {0} HTTP/1.1", header.Url)); } if (!string.IsNullOrEmpty(header.Host)) bulider.AppendLine(string.Format("Host: {0}", header.Host)); bulider.AppendLine("User-Agent: Mozilla/5.0 (Windows NT 6.1; IE 9.0)"); if (!string.IsNullOrEmpty(header.Referer)) bulider.AppendLine(string.Format("Referer: {0}", header.Referer)); bulider.AppendLine("Connection: keep-alive"); if (!string.IsNullOrEmpty(header.Accept)) { bulider.AppendLine(string.Format("Accept: {0}", header.Accept)); } else { bulider.AppendLine("Accept: */*"); } if (!string.IsNullOrEmpty(header.Cookies)) bulider.AppendLine(string.Format("Cookie: {0}", header.Cookies)); if (method.Equals(HttpMethod.POST)) { bulider.AppendLine(string.Format("Content-Length: {0}\r\n", Encoding.Default.GetBytes(header.Body).Length)); bulider.Append(header.Body); } else { bulider.Append("\r\n"); } return Encoding.Default.GetBytes(bulider.ToString()); } /// <summary> /// 从Url中提取Host信息 /// </summary> /// <param name="url">Url</param> /// <returns>Host信息</returns> public static string GetHost(string url) { string host = string.Empty; try { Uri uri = new Uri(url); host = uri.Host; } catch (Exception ex) { Debug.WriteLine(ex.Message); } return host; } /// <summary> /// 通过Host获取IP地址 /// </summary> /// <param name="host">Host</param> /// <returns>IP地址</returns> public static IPAddress GetAddress(string host) { IPAddress address = IPAddress.Any; try { IPAddress[] alladdress = Dns.GetHostAddresses(host); if (alladdress.Length > 0) { address = alladdress[0]; } } catch (Exception ex) { Debug.WriteLine(ex.Message); } return address; } /// <summary> /// 从HTTP返回头协议中取Set-Cookie信息(即Cookies) /// </summary> /// <param name="responseHeader">HTTP返回头协议</param> /// <returns>Cookies</returns> public static string GetCookies(string responseHeader) { StringBuilder cookies = new StringBuilder(); using (StringReader reader = new StringReader(responseHeader)) { string strLine = reader.ReadLine(); while (strLine != null) { if (strLine.StartsWith("Set-Cookie:")) { string temp = strLine.Remove(0, 12); if (!temp.EndsWith(";")) { temp = temp + ";"; } cookies.Append(temp); } strLine = reader.ReadLine(); } } return cookies.ToString(); } /// <summary> /// 从HTTP返回头协议中去Location地址(一般出现在301跳转) /// </summary> /// <param name="responseHeader">HTTP返回头协议</param> /// <returns>Location地址</returns> public static string GetLocation(string responseHeader) { string result = string.Empty; using (StringReader reader = new StringReader(responseHeader)) { string strLine = reader.ReadLine(); while (strLine != null) { if (strLine.StartsWith("Location:")) { result = strLine.Remove(0, 10); } strLine = reader.ReadLine(); } } return result; } #endregion } }