Python 3: scraping 2020 exhibition data and saving it to an Excel-compatible CSV file (full source code)

# -*- coding=UTF-8 -*-
import time
import requests
import re
import os
from bs4 import BeautifulSoup
from requests.exceptions import ConnectionError, ReadTimeout
import csv

# Request headers; the Cookie value below is tied to one browsing session and may need refreshing
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'Connection': 'keep-alive',
    'Accept-Encoding': 'gzip, deflate, br',  # note: decoding 'br' responses requires the brotli package
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
    'Cache-Control': 'max-age=0',
    'Upgrade-Insecure-Requests': '1',
    'Host': 'www.onezh.com',
    'Cookie': 'ASPSESSIONIDSATDATCA=AEEMLFFCELCFPHHDIOAPHPDL; Hm_lvt_51dcde608866b4a2f384527afc7057d8=1577428332; UM_distinctid=16f460e2d0cb0e-00ad1bf9aa4d99-6701b35-144000-16f460e2d0d8e0; bdshare_firstime=1577450426774; CNZZDATA1254894505=2145400881-1577425592-http%253A%252F%252Fwww.onezh.com%252F%7C1577452670; Hm_lpvt_51dcde608866b4a2f384527afc7057d8=1577457302',
    'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 10_3_1 like Mac OS X) AppleWebKit/603.1.30 (KHTML, like Gecko) Version/10.0 Mobile/14E304 Safari/602.1',
}

def getURL(url):  # fetch a page and return its HTML text
    a = requests.get(url, headers=headers)
    # a.encoding = 'utf-8'  # uncomment if the response text comes back garbled
    html = a.text
    return html
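
# Optional hardening (a sketch, not called anywhere below): the imports above already include
# time, ConnectionError and ReadTimeout, so a retrying variant of getURL with a request timeout
# could look like this. The name getURLWithRetry and the retry/delay values are illustrative.
def getURLWithRetry(url, retries=3, delay=2):
    for attempt in range(retries):
        try:
            return requests.get(url, headers=headers, timeout=10).text
        except (ConnectionError, ReadTimeout):
            time.sleep(delay)  # brief back-off before retrying
    return ''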

# Keyword filters: keep an exhibition only if its title contains one of these terms
# (industrial, advanced, manufacturing, equipment, technology, precision, smart manufacturing,
#  machining, foresight, new materials, Greater Bay Area)
listurl = ['工业', '先进', '制造', '先进制造', '设备', '装备', '技术', '精密', '智能制造', '加工', '前瞻', '新材料', '大湾区']
# Region filters: Shanghai, Zhejiang, Fujian, Guangdong, Guangxi, Hainan, Taiwan, Hong Kong, Macau
ci = ['上海', '浙江', '福建', '广东', '广西', '海南', '台湾', '香港', '澳门']

# Listing URLs, one per month of 2020; the leading "1" in the path is the page number
# and the trailing segments are the start/end dates of the month
urllist12 = [
    'http://www.onezh.com/zhanhui/1_0_0_0_20200101/20200131/',
    'http://www.onezh.com/zhanhui/1_0_0_0_20200201/20200229/',
    'http://www.onezh.com/zhanhui/1_0_0_0_20200301/20200331/',
    'http://www.onezh.com/zhanhui/1_0_0_0_20200401/20200430/',
    'http://www.onezh.com/zhanhui/1_0_0_0_20200501/20200531/',
    'http://www.onezh.com/zhanhui/1_0_0_0_20200601/20200630/',
    'http://www.onezh.com/zhanhui/1_0_0_0_20200701/20200731/',
    'http://www.onezh.com/zhanhui/1_0_0_0_20200801/20200831/',
    'http://www.onezh.com/zhanhui/1_0_0_0_20200901/20200930/',
    'http://www.onezh.com/zhanhui/1_0_0_0_20201001/20201031/',
    'http://www.onezh.com/zhanhui/1_0_0_0_20201101/20201130/',
    'http://www.onezh.com/zhanhui/1_0_0_0_20201201/20201231/']

def doDown():
    c = 0            # running row counter written to the first CSV column
    listlist = []    # website URLs already written, used to skip duplicate exhibitions
    # Write the CSV header row once (column names follow the requested spreadsheet template)
    with open('数据2.csv', 'a', encoding='utf-8-sig', newline='') as f:
        csv_writer = csv.writer(f, delimiter=',')
        csv_writer.writerow(['序号', '等级(为空,需要人工辅助)', 'Time(展会时间)日期限定为2020年的', 'Industry产业类别',
                             'Participation Status(留空)', 'Name(cn)展会名称(中文)', 'Name(en)展会名称(英文)', 'Place展览地点', 'Square使用面积', 'Organizer主办方', 'Exhibits展会简介', 'Website网址', 'Exhibitors参展商数量', 'Visitors观众数量', 'Target目标'])

    # Loop over the twelve monthly listing URLs
    for yue in urllist12:
        print('Month listing URL:', yue)
        # BeautifulSoup parses the listing page into a navigable HTML tree
        soup = BeautifulSoup(getURL(yue), 'html.parser')
        # Total number of result pages, e.g. "共12页" -> "12"
        yeshu = soup.find("span", {"class": "Total"}).get_text().replace('共', '').replace('页', '')
        for ye in range(1, int(yeshu) + 1):
            # Build each results-page URL: page number + the fixed filter/date suffix of the month URL
            pageurl = 'http://www.onezh.com/zhanhui/' + str(ye) + yue[-25:]
            soup4 = BeautifulSoup(getURL(pageurl), 'html.parser')  # parse the results page
            # All exhibition entries on the current page
            urlList = soup4.find("div", {"class": "jxs_list"}).find_all("div", {"class": "row"})
            for u in urlList:
                # Exhibition title and region text, used for keyword / region filtering
                guanjiazi = u.findAll('a')[0]['title']
                diqu2 = u.find("div", {"class": "cont"}).find("em", {"class": "cgree1"}).get_text()
                for u2 in listurl:
                    if u2 in guanjiazi:  # the title contains an industry keyword
                        for c2 in ci:
                            url2 = 'http://www.onezh.com' + u.find('a')['href']
                            try:
                                if c2 in diqu2:  # the region matches too
                                    print('Keyword:', u2, 'Region:', c2, 'Detail URL:', url2)
                                    soup2 = BeautifulSoup(getURL(url2), 'html.parser')  # parse the detail page

                                    # 3. Time: exhibition dates ('纠错' is the site's "report an error" link text)
                                    shijian = soup2.findAll("div", {"class": "bao-key"})[0].get_text().replace('纠错', '')

                                    # 4. Industry category
                                    chanye = soup2.find("ul", {"class": "tuan-l28"}).findAll("li")[0].get_text()

                                    # 6. Name (cn): taken from the page <title>
                                    mingcheng = soup2.findAll('title')[0].get_text()

                                    # 8. Place: venue text; everything from '乘车路线' (transit directions) onward is trimmed when writing
                                    didian = soup2.findAll("div", {"class": "bao-key"})[1].get_text() + \
                                             soup2.findAll("div", {"class": "bao-key"})[1].find("span").get_text()

                                    # 10. Organizer: the part after '承办单位' (organizing unit)
                                    zhuban = soup2.find("dl", {"class": "tuan-info mp5"}).find("dd").get_text().replace('承办单位', '  承办单位')

                                    # 11. Exhibits: exhibition introduction text
                                    jianjie = soup2.find("div", {"class": "article zhjs-1 zhjs_on"}).get_text()

                                    # 12. Website: list items in the contact block; the one labelled '网址' (website) holds the URL
                                    url3 = soup2.find("div", {"class": "top_dealer_1"}).findAll("li")
                                    #print('=============')
                                    url4 = ''  # default to empty so a missing website cannot leak in from the previous entry
                                    for uu in url3:
                                        if '网址' in uu.find('b').get_text():
                                            url4 = uu.get_text()

                                    # Skip the exhibition if its website URL has already been written
                                    website = url4.replace('网址:', '')
                                    if website in listlist:
                                        print('Already written:', website)
                                    else:
                                        listlist.append(website)
                                        with open('数据2.csv', 'a', encoding='utf-8-sig', newline='') as f:
                                            csv_writer = csv.writer(f, delimiter=',')
                                            c += 1
                                            csv_writer.writerow(
                                                [str(c), '', str(shijian), str(chanye), '', str(mingcheng), '',
                                                 str(didian[:didian.index('乘车路线')]), '',
                                                 str(zhuban[zhuban.index('承办单位') + 5:]), str(jianjie),
                                                 website, '', '', ''])

                            except Exception as ex:
                                print("Exception while processing %s: %s" % (url2, ex))
                                continue

if __name__ == '__main__':
    doDown()  # run the scraper
    print('Finished!')
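
The CSV above is written with a UTF-8 BOM (utf-8-sig), so Excel opens it directly with the Chinese column headers intact. If an actual .xlsx workbook is preferred, a minimal conversion sketch using pandas could look like this (pandas plus openpyxl are assumed to be installed; the output file name and sheet name are illustrative):

# Convert the scraped CSV into an Excel workbook (sketch; requires pandas and openpyxl)
import pandas as pd

df = pd.read_csv('数据2.csv', encoding='utf-8-sig')
df.to_excel('数据2.xlsx', index=False, sheet_name='2020')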
