import time
import os
import csv
import requests
from bs4 import BeautifulSoup
from requests.exceptions import ConnectionError, ReadTimeout
# Request headers copied from a browser session on www.onezh.com. The Cookie
# value is session-bound and will expire; refresh it from the browser if the
# site starts rejecting requests. ('br' is left out of Accept-Encoding because
# requests only decodes Brotli responses when the brotli package is installed.)
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'Connection': 'keep-alive',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
    'Cache-Control': 'max-age=0',
    'Upgrade-Insecure-Requests': '1',
    'Host': 'www.onezh.com',
    'Cookie': 'ASPSESSIONIDSATDATCA=AEEMLFFCELCFPHHDIOAPHPDL; Hm_lvt_51dcde608866b4a2f384527afc7057d8=1577428332; UM_distinctid=16f460e2d0cb0e-00ad1bf9aa4d99-6701b35-144000-16f460e2d0d8e0; bdshare_firstime=1577450426774; CNZZDATA1254894505=2145400881-1577425592-http%253A%252F%252Fwww.onezh.com%252F%7C1577452670; Hm_lpvt_51dcde608866b4a2f384527afc7057d8=1577457302',
    'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 10_3_1 like Mac OS X) AppleWebKit/603.1.30 (KHTML, like Gecko) Version/10.0 Mobile/14E304 Safari/602.1',
}
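
# The hardcoded Cookie is the fragile part of these headers. A sketch of an
# alternative (an assumption, untested against this site): let requests.Session
# collect fresh cookies from a warm-up request instead.
#
#   session = requests.Session()
#   session.headers.update({k: v for k, v in headers.items() if k != 'Cookie'})
#   session.get('http://www.onezh.com/')  # server sets session cookies here
#   html = session.get(url).text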

def getURL(url):
    # Fetch a page and return its HTML; the timeout keeps one stalled
    # connection from hanging the whole crawl.
    a = requests.get(url, headers=headers, timeout=10)
    return a.text
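
# The ConnectionError/ReadTimeout imports suggest retry handling was planned
# but never wired in. A minimal sketch (the helper name, attempt count, and
# back-off are assumptions, not part of the original script):
def getURLWithRetry(url, retries=3, backoff=2):
    # Retry transient network failures with a short pause between attempts.
    for attempt in range(retries):
        try:
            return getURL(url)
        except (ConnectionError, ReadTimeout):
            if attempt == retries - 1:
                raise
            time.sleep(backoff)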
# Keywords matched against exhibition titles: industrial, advanced,
# manufacturing, advanced manufacturing, facilities, equipment, technology,
# precision, smart manufacturing, machining, foresight, new materials,
# Greater Bay Area.
listurl = ['工业', '先进', '制造', '先进制造', '设备', '装备', '技术', '精密',
           '智能制造', '加工', '前瞻', '新材料', '大湾区']
# Target regions (provinces/cities) matched against the venue text.
ci = ['上海', '浙江', '福建', '广东', '广西', '海南', '台湾', '香港', '澳门']
# Monthly listing URLs for 2020; the leading "1" in the path segment is the page number.
urllist12 = [
'http://www.onezh.com/zhanhui/1_0_0_0_20200101/20200131/',
'http://www.onezh.com/zhanhui/1_0_0_0_20200201/20200229/',
'http://www.onezh.com/zhanhui/1_0_0_0_20200301/20200331/',
'http://www.onezh.com/zhanhui/1_0_0_0_20200401/20200430/',
'http://www.onezh.com/zhanhui/1_0_0_0_20200501/20200531/',
'http://www.onezh.com/zhanhui/1_0_0_0_20200601/20200630/',
'http://www.onezh.com/zhanhui/1_0_0_0_20200701/20200731/',
'http://www.onezh.com/zhanhui/1_0_0_0_20200801/20200831/',
'http://www.onezh.com/zhanhui/1_0_0_0_20200901/20200930/',
'http://www.onezh.com/zhanhui/1_0_0_0_20201001/20201031/',
'http://www.onezh.com/zhanhui/1_0_0_0_20201101/20201130/',
'http://www.onezh.com/zhanhui/1_0_0_0_20201201/20201231/']
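
# The twelve URLs above could also be generated rather than hand-written. A
# sketch, equivalent for 2020 (left as a comment so the explicit list above
# stays authoritative):
#
#   import calendar
#   urllist12 = ['http://www.onezh.com/zhanhui/1_0_0_0_2020%02d01/2020%02d%02d/'
#                % (m, m, calendar.monthrange(2020, m)[1]) for m in range(1, 13)]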

def doDown():
    c = 0
    seen = set()  # detail-page websites already written, to avoid duplicate rows
    # Write the header row only when the file is new; a plain append would add
    # a second header line on every re-run.
    if not os.path.exists('数据2.csv'):
        with open('数据2.csv', 'a', encoding='utf-8-sig', newline='') as f:
            csv.writer(f, delimiter=',').writerow(
                ['No.', 'Level (blank, fill in manually)', 'Time (exhibition dates, 2020 only)',
                 'Industry', 'Participation Status (blank)', 'Name (cn)', 'Name (en)', 'Place',
                 'Square (floor area)', 'Organizer', 'Exhibits (exhibition profile)', 'Website',
                 'Exhibitors (count)', 'Visitors (count)', 'Target'])
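    # NOTE (assumption about intent): `seen` only dedupes within a single run;
    # since rows are appended, re-running the script can duplicate exhibitions
    # that an earlier run already wrote.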
    for yue in urllist12:
        print('Month listing URL:', yue)
        soup = BeautifulSoup(getURL(yue), 'html.parser')
        # Page counter reads like "共N页" ("N pages in total").
        yeshu = soup.find("span", {"class": "Total"}).get_text().replace('共', '').replace('页', '')
        for ye in range(1, int(yeshu) + 1):
            # The leading "1" in each month URL is the page number; swap it for ye.
            pageurl = yue.replace('/zhanhui/1_', '/zhanhui/%d_' % ye)
            soup4 = BeautifulSoup(getURL(pageurl), 'html.parser')
            urlList = soup4.find("div", {"class": "jxs_list"}).find_all("div", {"class": "row"})
            for u in urlList:
                guanjiazi = u.findAll('a')[0]['title']  # exhibition title
                diqu2 = u.find("div", {"class": "cont"}).find("em", {"class": "cgree1"}).get_text()  # venue/region text
                # First keyword found in the title and first target region found
                # in the venue text (plain substring match); scraping once per
                # listing avoids re-fetching the detail page for every
                # keyword/region pair that happens to match.
                u2 = next((k for k in listurl if k in guanjiazi), None)
                c2 = next((r for r in ci if r in diqu2), None)
                if u2 is None or c2 is None:
                    continue
                url2 = 'http://www.onezh.com' + u.find('a')['href']
                try:
                    print('Keyword:', u2, 'Region:', c2, 'Detail link:', url2)
                    soup2 = BeautifulSoup(getURL(url2), 'html.parser')
                    # Exhibition dates; drop the trailing "纠错" (report-an-error) link text.
                    shijian = soup2.findAll("div", {"class": "bao-key"})[0].get_text().replace('纠错', '')
                    # Industry category.
                    chanye = soup2.find("ul", {"class": "tuan-l28"}).findAll("li")[0].get_text()
                    # Exhibition name, taken from the page <title>.
                    mingcheng = soup2.findAll('title')[0].get_text()
                    # Venue: the second "bao-key" block plus its <span>.
                    didian = soup2.findAll("div", {"class": "bao-key"})[1].get_text() + \
                        soup2.findAll("div", {"class": "bao-key"})[1].find("span").get_text()
                    # Organizer; a space goes before the "承办单位" (co-organizer)
                    # label so the two names do not run together.
                    zhuban = soup2.find("dl", {"class": "tuan-info mp5"}).find("dd").get_text().replace('承办单位', ' 承办单位')
                    # Exhibition profile text.
                    jianjie = soup2.find("div", {"class": "article zhjs-1 zhjs_on"}).get_text()
                    # Info rows; the one labelled "网址" (website) carries the URL.
                    url3 = soup2.find("div", {"class": "top_dealer_1"}).findAll("li")
                    for uu in url3:
                        if '网址' in uu.find('b').get_text():
                            url4 = uu.get_text().replace('网址:', '')
                            if url4 in seen:
                                print('Already saved:', url4)
                                continue
                            seen.add(url4)
                            # Plain utf-8 on purpose: 'utf-8-sig' writes a fresh
                            # BOM each time the file is re-opened for append.
                            with open('数据2.csv', 'a', encoding='utf-8', newline='') as f:
                                csv_writer = csv.writer(f, delimiter=',')
                                c += 1
                                # Venue is cut before "乘车路线" (directions); the
                                # organizer is taken after the "承办单位" label. If
                                # either label is missing, .index() raises and the
                                # except below absorbs it.
                                csv_writer.writerow(
                                    [str(c), '', str(shijian), str(chanye), '', str(mingcheng), '',
                                     str(didian[:didian.index('乘车路线')]), '',
                                     str(zhuban[zhuban.index('承办单位') + 5:]), str(jianjie),
                                     str(url4), '', '', ''])
                except Exception as ex:
                    print('Exception: %s' % ex, url2)
                    continue

if __name__ == '__main__':
    doDown()
    print('Done!')