招标网招标信息爬取

本文实现了在多个招标网站上按指定关键词爬取招标信息,并将结果按网站分别保存为文本文件。


#采购与招标信息网
#https://www.chinabidding.cn/
#中国电信外部门户招标信息
#https://42.99.33.26/MSS-PORTAL/account/login.do
#中国移动采购与招标网
#https://b2b.10086.cn/b2b/main/showBiao!preIndex.html?noticeType=list1
#中国移动广东门户招标信息
#http://www.telewiki.cn/supplier/viewLogin.action

import urllib.request
import re
import datetime
from lxml import etree
import ssl
ssl._create_default_https_context = ssl._create_unverified_context

# Keywords to search for on every site (placeholders; replace with real terms).
keywdL = [
    'xxx',
    'xxx',
    'xxx',
]

# strftime format used for the date parameters in query URLs.
dateFormat = '%Y-%m-%d'

# Current local date for query URLs.
def get_today():
    """Return the current local date/time formatted with ``dateFormat``."""
    return datetime.datetime.now().strftime(dateFormat)

# Start of the query window.
def get_startdate():
    """Return the moment ten days before now, formatted with ``dateFormat``."""
    lookback = datetime.timedelta(days=10)
    window_start = datetime.datetime.now() - lookback
    return window_start.strftime(dateFormat)

# Compact date stamp used in output file names.
def get_day():
    """Return the current local date as a ``YYYYMMDD`` string."""
    return datetime.datetime.now().strftime('%Y%m%d')

# Output file names, one per site; each embeds the date stamp from get_day().
fileName1 = "".join(["采购与招标网招标信息", get_day(), '.txt'])
fileName2 = "".join(["中国电信外部门户网站招标信息", get_day(), ".txt"])
fileName3 = "".join(["中国移动采购与招标网招标信息", get_day(), ".txt"])
fileName4 = "".join(["中国移动广东公司供应商门户招标信息", get_day(), ".txt"])

# Fetch a page and return its text.
def url_open(url, timeout=30):
    """Fetch *url* and return the response body decoded as UTF-8.

    The request carries a desktop-browser User-Agent header. Bytes that are
    not valid UTF-8 are dropped during decoding.

    Args:
        url: Address to fetch.
        timeout: Seconds to wait on blocking network operations before giving
            up, so that one unresponsive server cannot stall the whole run.

    Returns:
        The decoded body as ``str``, or ``None`` if the request failed for any
        reason (the error is printed; this function is best-effort by design).
    """
    user_agent = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.22 Safari/537.36 SE 2.X MetaSr 1.0"
    try:
        # The header is attached to this request only, instead of re-installing
        # a process-wide opener on every call.
        request = urllib.request.Request(url, headers={"user-agent": user_agent})
        # The context manager closes the connection even if read() fails.
        with urllib.request.urlopen(request, timeout=timeout) as response:
            return response.read().decode("utf-8", "ignore")
    except Exception as e:
        print(str(e))
        return None

# ---------------------------------------------------------------------------
# Per-site scrapers.
#
# NOTE(review): this section was recovered from text in which the code had
# been collapsed onto a few physical lines. Statement nesting was rebuilt
# from syntax; in particular, the source does not show whether
# print("完成") sat inside or after each keyword loop -- it is placed after
# the loop here. Confirm against the original script.
# ---------------------------------------------------------------------------

# NOTE(review): the three regular expressions used for chinabidding.cn were
# lost in extraction (the HTML inside the pattern strings was stripped, leaving
# at most a bare "(.*?)"). They cannot be reconstructed from this file. Fill
# each one in from the current markup of the search-result page, with exactly
# one capture group; until then get_content1() reports the gap and returns.
CHINABIDDING_URL_PAT = None
CHINABIDDING_TITLE_PAT = None
CHINABIDDING_TIME_PAT = None


def _append_lines(path, lines):
    """Append every string in *lines* to *path*, each followed by a newline.

    The file is written as UTF-8 so that a title containing characters outside
    the platform's default code page cannot abort the write.
    """
    with open(path, 'a', encoding='utf-8') as fh:
        fh.writelines(line + "\n" for line in lines)


def _write_titles(path, titles):
    """Append a "title + separator" record to *path* for each title."""
    records = []
    for page_title in titles:
        records.append("招标主题:" + page_title)
        records.append("-----------------")
    if records:
        _append_lines(path, records)


# chinabidding.cn: parse one search-result page.
def get_content1(url):
    """Extract link, title and publish time from *url* and append to fileName1.

    Errors are printed rather than raised so one bad page does not stop the run.
    """
    try:
        patterns = (CHINABIDDING_URL_PAT, CHINABIDDING_TITLE_PAT, CHINABIDDING_TIME_PAT)
        if not all(patterns):
            print("采购与招标网的正则表达式缺失,无法解析查询结果")
            return
        data = url_open(url)
        if not data:
            # url_open already reported the failure.
            return
        urlL = re.findall(CHINABIDDING_URL_PAT, data)
        titleL = re.findall(CHINABIDDING_TITLE_PAT, data)
        timeL = re.findall(CHINABIDDING_TIME_PAT, data)
        records = []
        # zip() pairs the three result lists and stops at the shortest one, so
        # a pattern that matched fewer times cannot cause an IndexError.
        for page_path, page_title, pub_time in zip(urlL, titleL, timeL):
            records.append("招标主题:" + page_title)
            records.append("招标内容:" + "https://www.chinabidding.cn" + page_path)
            records.append("发布时间:" + pub_time)
            records.append("------------------------")
        if records:
            _append_lines(fileName1, records)
    except Exception as e:
        print(str(e))


# chinabidding.cn: run every keyword and save the results.
def caigouwang():
    """Query chinabidding.cn for each keyword in keywdL, writing to fileName1."""
    try:
        _append_lines(fileName1, ["采购与招标网:https://www.chinabidding.cn/"])
        for keywd in keywdL:
            keywords = urllib.request.quote(keywd)
            _append_lines(fileName1, ["关键词:" + keywd])
            url = ("https://www.chinabidding.cn/search/searchzbw/search2?keywords="
                   + keywords
                   + "&table_type=&areaid=&categoryid=&b_date=week")
            print("正在查询关键词:" + keywd)
            get_content1(url)
        print("完成")
    except Exception as e:
        print(e)


# China Telecom external portal: parse one result page.
def get_content2(url):
    """Extract announcement titles from *url* and append them to fileName2.

    Only titles are recorded for this site; no detail link is written.
    """
    try:
        data = url_open(url)
        if not data:
            return
        html = etree.HTML(data)
        page_titleL = html.xpath('//table[@class="table_data"]//td/a[@href="#"]/text()')
        _write_titles(fileName2, page_titleL)
    except Exception as e:
        print(e)


# China Telecom external portal: run every keyword and save the results.
def dianxin():
    """Query the China Telecom portal for each keyword and each province code."""
    try:
        _append_lines(fileName2, ["中国电信外部门户网站:https://42.99.33.26/MSS-PORTAL/account/login.do"])
        # The two values sent as the provinceJT query parameter.
        proL = ['JT', 'NJT']
        for keywd in keywdL:
            keywords = urllib.request.quote(keywd)
            for province in proL:
                url = ("https://42.99.33.26/MSS-PORTAL/announcementjoin/list.do?provinceJT="
                       + province
                       + "&docTitle=" + keywords
                       + "&docCode=&provinceCode=&startDate=" + get_startdate()
                       + "&endDate=&docType=&paging.start=1&paging.pageSize=40&pageNum=40&goPageNum=1")
                print("正在查询关键词(" + province + "):" + keywd)
                _append_lines(fileName2, ["关键词(" + province + "):" + keywd])
                get_content2(url)
        print("完成")
    except Exception as e:
        print(e)


# China Mobile procurement site: parse one result page.
def get_content3(url):
    """Extract announcement titles from *url* and append them to fileName3."""
    try:
        data = url_open(url)
        if not data:
            return
        html = etree.HTML(data)
        page_titleL = html.xpath('//td[@style="width:280px;"]/a[@href="#this"]/@title')
        _write_titles(fileName3, page_titleL)
    except Exception as e:
        print(e)


# China Mobile procurement site: run every keyword and save the results.
def yidong():
    """Query b2b.10086.cn for each keyword in keywdL, writing to fileName3."""
    try:
        _append_lines(fileName3, ["中国移动采购与招标:https://b2b.10086.cn/b2b/main/showBiao!preIndex.html?noticeType=list1"])
        for keywd in keywdL:
            keywords = urllib.request.quote(keywd)
            # The recovered text showed "¬iceBean" here: "&not" had been
            # rendered as the HTML entity for the "not" sign. The parameter
            # names are restored to "&noticeBean.*".
            url = ("https://b2b.10086.cn/b2b/main/listVendorNoticeResult.html"
                   "?page.currentPage=1&page.perPageSize=40"
                   "&noticeBean.sourceCH=&noticeBean.source="
                   "&noticeBean.title=" + keywords
                   + "&noticeBean.startDate=" + get_startdate()
                   + "&noticeBean.endDate=")
            _append_lines(fileName3, ["关键词:" + keywd])
            print("正在查询关键词:" + keywd)
            get_content3(url)
        print("完成")
    except Exception as e:
        print(e)


# China Mobile Guangdong supplier portal: parse one result page.
def get_content4(url):
    """Extract titles and detail links from *url* and append them to fileName4."""
    try:
        data = url_open(url)
        if not data:
            return
        html = etree.HTML(data)
        page_titleL = html.xpath("//span[@class='ptitle']/a/text()")
        page_urlL = html.xpath("//span[@class='ptitle']/a/@onclick")
        records = []
        for page_title, onclick in zip(page_titleL, page_urlL):
            # The notice id is taken as the onclick text minus its first five
            # and last two characters.
            page_url = ("http://www.telewiki.cn/notice/notice!queryNoticeDetail.action?noticeSO.noticeid="
                        + onclick[5:-2])
            records.append("招标主题:" + page_title)
            records.append("招标内容:" + page_url)
            records.append("-----------------")
        if records:
            _append_lines(fileName4, records)
    except Exception as e:
        print(e)


# China Mobile Guangdong supplier portal: run every keyword and save the results.
def guangdong():
    """Query telewiki.cn for each keyword in keywdL, writing to fileName4."""
    try:
        _append_lines(fileName4, ["中国移动广东公司供应商门户网站:http://www.telewiki.cn/supplier/viewLogin.action"])
        for keywd in keywdL:
            keywords = urllib.request.quote(keywd)
            url = ("http://www.telewiki.cn/notice/notice!queryPurchaseList.action?random=0.000059963069461321794&queryListSO.queryProjectName="
                   + keywords
                   + "&queryListSO.queryRegionCompany=&queryListSO.queryOpMethod=&queryListSO.queryBegindate="
                   + get_startdate()
                   + "&queryListSO.queryEnddate=" + get_today()
                   + "&queryListSO.step=&queryListSO.applyState=&queryListSO.purchaseType=&queryListSO.status=0")
            _append_lines(fileName4, ["关键词:" + keywd])
            print("正在查询关键词:" + keywd)
            get_content4(url)
        print("完成")
    except Exception as e:
        print(e)


if __name__ == '__main__':
    print("-----开始爬取采购与招标网------")
    caigouwang()
    print("-----开始爬取中国电信外部门户网站------")
    dianxin()
    print("-----开始爬取中国移动采购与招标网------")
    yidong()
    print("-----开始爬取中国移动广东公司供应商门户网站------")
    guangdong()

你可能感兴趣的:(爬虫)