以编程方式抓取中国电信电子发票

电子发票通常以PDF文件的形式存在,各大电商及运营商平台几乎都提供电子发票,如京东、淘宝(天猫)、苏宁易购、携程、中国联通、中国电信、中国移动等。那么,我们如何以编程方式抓取这些平台的电子发票呢?下面直接给出抓取中国电信电子发票的代码供参考。实际上,经测试,各大电商平台的电子发票都是可以抓取的。欢迎加QQ 283335746,共同探讨如何以编程方式抓取数据。

/// <summary>
    /// China Telecom (189.cn) e-invoice crawler. Using the cookie of a logged-in
    /// user it queries the invoice page for each recent billing month, downloads
    /// the invoice PDF, parses it and hands the results to <c>ThirdPartyBll</c>.
    /// </summary>
    public class ChinaTelecom
    {
        private ChinaTelecom() { }

        /// <summary>
        /// Creates a crawler bound to one user and that user's 189.cn cookie.
        /// </summary>
        /// <param name="userId">Identifier of the local user the invoices belong to.</param>
        /// <param name="cookieAppend">Raw cookie string of the logged-in 189.cn session;
        /// the <c>userId</c> and <c>cityCode</c> entries are extracted from it.</param>
        public ChinaTelecom(string userId, string cookieAppend)
        {
            this._userId = userId;
            this._userCookie = cookieAppend;
            this._relationUserId = Common.CookieFilter(_userCookie, string.Format(CookieValuePattern, "userId"));
            GetCityDomain();
            this._thirdPartyRequest = new ThirdPartyRequest(cookieAppend, ThirdPartyOptions.ChinaTelecom);
            GetYearMonths();
            this.OrderInvoices = new List<DownloadInvoiceInfo>();
            this._invoiceBll = new ThirdPartyBll(_userId, _relationUserId, ThirdPartyOptions.ChinaTelecom);
            this._client = new NetClient(_baseUrl);
        }

        // Immutable format templates. The original code formatted the static fields
        // below in place, which destroyed the "{0}" placeholder after the first
        // instance and pinned every later user to the first user's city domain.
        private const string CityHostTemplate = "http://www.189.cn{0}";
        private const string BaseUrlTemplate = "http://{0}.189.cn";

        // Regex template passed to Common.CookieFilter; {0} is the cookie name.
        private const string CookieValuePattern = @"\s*{0}\s*=\s*(.|\n)*?;";

        // Upper bound on the number of billing months queried per run.
        private const int MaxQueryMonths = 12;

        // Kept for backward compatibility with existing readers inside the assembly.
        // They are refreshed by each constructed instance (last writer wins); the
        // class itself only uses the per-instance _baseUrl.
        internal static string CityHost = CityHostTemplate;
        internal static string BaseUrl = BaseUrlTemplate;
        internal const string QueryInvoicesFirstUrl = "{0}/pages/selfservice/usercomplaintsinfo/queryNetInvoice.action?pageDataNum=10";
        internal const string QueryInvoicesUrl = "{0}/pages/selfservice/usercomplaintsinfo/queryNetInvoice.action?pageDataNum=12&acctMonth={1}";
        internal const string DownloadInvoiceUrl = "{0}/pages/selfservice/usercomplaintsinfo/DownloadNetInvoice.action?index=1";

        private readonly string _userId = string.Empty;
        private readonly string _userCookie = string.Empty;
        private readonly string _relationUserId = string.Empty;
        private readonly NetClient _client;
        private readonly ThirdPartyRequest _thirdPartyRequest;
        private readonly ThirdPartyBll _invoiceBll;

        // Base URL of the city sub-domain for THIS user (e.g. "http://www.189.cn").
        private string _baseUrl;

        // Billing months ("yyyyMM", newest first) to query. Per instance so that
        // constructing another crawler cannot modify the list while this one is
        // iterating it across awaits.
        private readonly List<string> _yearMonths = new List<string>();

        private List<DownloadInvoiceInfo> OrderInvoices { get; set; }

        /// <summary>
        /// Entry point: downloads and parses the invoices, then saves the collected set.
        /// </summary>
        /// <returns>A task that completes when the invoices have been passed to
        /// <c>ThirdPartyBll.SaveDownloadInvoice</c>.</returns>
        public async Task ExecuteInvoiceAsync()
        {
            if (Log.IsDebugEnabled) Log.Debug("ChinaTelecom.ExecuteInvoiceAsync is starting,UserId:{0},RelationUserId:{1}", _userId, _relationUserId);

            // Download and parse the invoice information.
            await GetOrderInvoicesAsync();

            // Persist the collected invoice set.
            await _invoiceBll.SaveDownloadInvoice(OrderInvoices);
        }

        /// <summary>
        /// Queries the invoice page for every month in the list and appends each
        /// successfully downloaded invoice to <see cref="OrderInvoices"/>.
        /// </summary>
        /// <returns>A task that completes when all months were processed or an
        /// empty query response ended the run.</returns>
        private async Task GetOrderInvoicesAsync()
        {
            for (var index = 0; index < _yearMonths.Count; index++)
            {
                // The first request uses the month-less query URL; later requests name the month.
                var queryInvoicesUrl = index == 0
                    ? string.Format(QueryInvoicesFirstUrl, _baseUrl)
                    : string.Format(QueryInvoicesUrl, _baseUrl, _yearMonths[index]);

                // The referer is the previous month's query URL (a fixed value for the first request).
                var queryReferer = index == 0
                    ? "diagnostics://4/"
                    : string.Format(QueryInvoicesUrl, _baseUrl, _yearMonths[index - 1]);

                var request = _thirdPartyRequest.CreateRequest(queryInvoicesUrl, queryReferer);
                var response = await _client.ExecuteAsync(request);

                // NOTE(review): an empty query response aborts ALL remaining months,
                // whereas a failed download below only skips the month - confirm intended.
                if (response.ContentLength == 0) return;

                var downloadInvoiceInfo = await DownloadInvoiceAsync(response.ResponseUri.ToString());
                if (downloadInvoiceInfo == null) continue;

                OrderInvoices.Add(downloadInvoiceInfo);
            }
        }

        /// <summary>
        /// Downloads the invoice PDF, parses it, and moves the files from the
        /// temporary location to their final storage path.
        /// </summary>
        /// <param name="refererUrl">URL of the query page, sent as the referer of the download request.</param>
        /// <returns>The download result, or <c>null</c> when the response is empty or
        /// no invoice code/number could be parsed from the file.</returns>
        private async Task<DownloadInvoiceInfo> DownloadInvoiceAsync(string refererUrl)
        {
            var request = _thirdPartyRequest.CreateRequest(string.Format(DownloadInvoiceUrl, _baseUrl), refererUrl);
            var response = await _client.ExecuteAsync(request);
            if (response.ContentLength == 0) return null;

            var tempPath = FileHelper.GetTempFileName(string.Format("{0}.pdf", Guid.NewGuid()));
            await FileHelper.Save(response.RawBytes, tempPath);

            var invoiceInfo = await _invoiceBll.GetInvoiceInfoByFile(tempPath);
            // Guard against a null parse result as well as missing key fields.
            if (invoiceInfo == null || string.IsNullOrEmpty(invoiceInfo.InvoiceCode) || string.IsNullOrEmpty(invoiceInfo.InvoiceNumber)) return null;

            // Once parsed, move the temporary files to the permanent storage path.
            var filePath = FileHelper.GetFilePath("ChinaTelecom", string.Format("{0}-{1}.pdf", invoiceInfo.InvoiceCode, invoiceInfo.InvoiceNumber));
            FileHelper.MoveFile(tempPath, filePath);
            // Presumably GetInvoiceInfoByFile writes a .jpg next to the .pdf - TODO confirm.
            FileHelper.MoveFile(tempPath.Replace(".pdf", ".jpg"), filePath.Replace(".pdf", ".jpg"));
            invoiceInfo.FileUrl = FileHelper.ToVirtualUrl(filePath);
            invoiceInfo.Picture = invoiceInfo.FileUrl.Replace(".pdf", ".jpg");

            return new DownloadInvoiceInfo
            {
                FilePath = filePath,
                InvoiceInfo = invoiceInfo,
                RefererUrl = refererUrl,
                DownloadUrl = request.Resource
            };
        }

        /// <summary>
        /// Extracts <c>cityCode</c> from the cookie and derives this user's base URL;
        /// falls back to the <c>www</c> sub-domain when no city code is present.
        /// </summary>
        private void GetCityDomain()
        {
            var cityCode = Common.CookieFilter(_userCookie, string.Format(CookieValuePattern, "cityCode"));
            if (string.IsNullOrEmpty(cityCode))
            {
                if (Log.IsWarnEnabled) Log.Warn("ChinaTelecom.GetCityDomain,cityCode is null");
                _baseUrl = string.Format(BaseUrlTemplate, "www");
                BaseUrl = _baseUrl;
                return;
            }

            _baseUrl = string.Format(BaseUrlTemplate, cityCode);

            // Always format from the constant templates so a second instance with a
            // different city code does not inherit the first instance's values.
            CityHost = string.Format(CityHostTemplate, "/" + cityCode + "/");
            BaseUrl = _baseUrl;
        }

        /// <summary>
        /// Builds the list of months ("yyyyMM") offered by the e-invoice page: starting
        /// with last month and going backwards, at most <c>MaxQueryMonths</c> entries and
        /// never reaching <c>GlobalConfig.MinValidateInvoiceDate</c>.
        /// </summary>
        private void GetYearMonths()
        {
            _yearMonths.Clear();

            var currentTime = DateTime.Now.AddMonths(-1);
            while (currentTime > GlobalConfig.MinValidateInvoiceDate && _yearMonths.Count < MaxQueryMonths)
            {
                _yearMonths.Add(currentTime.ToString("yyyyMM"));
                currentTime = currentTime.AddMonths(-1);
            }
        }
    }

你可能感兴趣的:(电子发票,C#/.NET爬虫抓取)