Scraping and Analyzing Automotive Consumer Complaint Data (a Python Crawler)

Multithreaded crawler code

"""
name:汽车用户消费投诉_品牌url爬取,已完成
author:zhangxiaoyu
"""
import _thread
import random
import re
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from sqlalchemy import create_engine


def new_headers():
    """
    Generate a randomized request header.
    :return: headers dict
    """
    a = random.randint(1, 999)
    b = random.randint(1, 99)

    # Randomize the User-Agent by varying the Safari build token
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/' + str(
            a) + '.' + str(b)
    }
    return headers
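
# An alternative sketch (not wired in below): sample from a small pool of real
# User-Agent strings instead of inventing Safari build numbers. The two pool
# entries here are illustrative, not part of the original script.
USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.1 Safari/605.1.15',
]


def new_headers_from_pool():
    """Pick a random, well-formed User-Agent from the pool."""
    return {'User-Agent': random.choice(USER_AGENTS)}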


def get_url_for_all_brand():
    """
    Scrape every brand's URL and write the results to the database.
    :return: None
    """

    # Generate a randomized User-Agent
    headers = new_headers()

    # Configure the browser
    options = webdriver.ChromeOptions()

    # Swap in the randomized User-Agent
    options.add_argument('user-agent=' + headers['User-Agent'])

    # Open a blank browser window
    driver = webdriver.Chrome(options=options)

    # Target page: the complaint site's brand list
    url = 'http://tousu.315che.com/tousulist/serial/93/'

    # Open the page
    driver.get(url=url)

    # Grab the page source
    source1 = driver.page_source
    # print(source1)

    # Locate and click each A-Z tab via its XPath; break out of the loop when past the end

    source = driver.page_source

    # Extract the brand names from the page source
    car_name = re.findall('(.{1,40})', source)
    for i in car_name:
        print(i)

    # Extract each brand's link
    car_href = re.findall('.{1,40}', source)
    for i in car_href:
        print(i)

    data = pd.DataFrame({
        'car_name': car_name,
        'car_href': car_href
    })

    # Connection string: mysql+pymysql://user:password@host:port/database?charset
    con = create_engine('mysql+pymysql://root:[email protected]:3306/python爬虫?charset=utf8')
    # Write to the database
    data.to_sql('品牌url汇总', con=con, if_exists='append')

    print("success")
    driver.close()


def download_url_for_all_brand():
    """
    Download every brand's URL from the database.
    :return: a DataFrame of brand names and their URLs
    """

    # Build the database engine with sqlalchemy
    con = 'mysql+pymysql://root:[email protected]:3306/python爬虫?charset=utf8'
    engine = create_engine(con)
    # SQL command
    sql_cmd = "SELECT * FROM 品牌url汇总"

    url_brand = pd.read_sql(sql=sql_cmd, con=engine)[['car_name', 'car_href']]
    # print(url_brand)
    return url_brand
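

# Usage sketch (illustrative, not called anywhere): itertuples() avoids the
# positional url_brand.loc[i][0] lookups used further down and reads clearer.
def preview_brand_urls():
    url_brand = download_url_for_all_brand()
    for row in url_brand.itertuples(index=False):
        print(row.car_name, row.car_href)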


def get_brand_detail_url(brand, brand_url):
    """
    name: scrape a single brand's complaint pages
    author: zhangxiaoyu
    """
    all_detail_url_list = []

    # Generate a randomized User-Agent
    headers = new_headers()

    # Configure the browser
    options = webdriver.ChromeOptions()

    # Swap in the randomized User-Agent
    options.add_argument('user-agent=' + headers['User-Agent'])

    # Open a blank browser window
    driver = webdriver.Chrome(options=options)

    # Open the brand's complaint-list page
    driver.get(url=brand_url)

    # Grab the page source
    source = driver.page_source
    # print(source)

    page_num = re.findall('共(.+)页', source)
    print("Pages: {}".format(page_num))
    # print(driver.current_url)

    if len(page_num) > 0:
        # Walk every results page
        for page in range(1, int(page_num[0]) + 1):
            brand_url_page = driver.current_url + "/0/0/0/" + str(page) + ".htm"
            print(brand_url_page)

            try:
                # Fetch this results page
                response = requests.get(url=brand_url_page, headers=headers)
                response.encoding = 'utf-8'
                source = response.text

                # Collect the URL of each complaint
                soup = BeautifulSoup(source, 'lxml')
                # print(soup)
                soup1 = soup.find_all(class_="tousu-filter-list")
                # print(soup1)
                detail_url_list = re.findall('', str(soup1))
                print(detail_url_list)
                for feedback_url in detail_url_list:
                    all_detail_url_list.append(feedback_url)
                time.sleep(1)
            except Exception:
                # Skip pages that fail to load
                pass

        # De-duplicate the collected URLs
        all_detail_url_set = set(all_detail_url_list)
        all_detail_url_list = list(all_detail_url_set)

        # Visit each complaint's detail page
        print(all_detail_url_list)
        for feedback_url in all_detail_url_list:
            # print(feedback_url)
            try:
                # Scrape the detail page
                get_feedback(brand, feedback_url)
                print(feedback_url + " done")
            except Exception:
                print(feedback_url + " failed!!!")

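# A sketch of a retry helper that the bare `except` blocks above could
# delegate to; the name fetch_with_retry and its parameters are illustrative
# additions, not part of the original script.
def fetch_with_retry(url, headers, retries=3, backoff=2.0):
    """Fetch a page, retrying with a growing delay on network errors."""
    for attempt in range(retries):
        try:
            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()
            response.encoding = 'utf-8'
            return response.text
        except requests.RequestException as exc:
            print("attempt {} failed for {}: {}".format(attempt + 1, url, exc))
            time.sleep(backoff * (attempt + 1))
    return None
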

def get_feedback(brand, feedback_url):
    """
    :param brand: car brand
    :param feedback_url: URL of a single complaint
    :return: None
    """

    # Generate a randomized User-Agent
    a = random.randint(1, 999)
    b = random.randint(1, 99)
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/' + str(
            a) + '.' + str(b)
    }

    # Fetch the complaint page
    response = requests.get(url=feedback_url, headers=headers)

    # Set the page encoding
    response.encoding = 'utf-8'

    # Get the page source
    source = response.text

    # Complaint number
    feedback_no = re.findall('单号:(.+)', source)
    print(feedback_no)
    # if feedback_no[0] is "":
    #     return 0

    # Brand and model
    brand_model = re.findall('品牌车型:(.+)', source)
    # print(brand_model)

    # Complaint issue
    feedback_question = re.findall('诉求问题:(.+)', source)
    print(feedback_question)

    # Complaint time
    feedback_time = re.findall('投诉时间:(....-.{0,3}-.{0,3} ..:..:..)', source)
    # print(feedback_time)

    # Dealer
    shop = re.findall('经销商:(.+)', source)
    # print(shop)

    # Complaint body text
    soup = BeautifulSoup(source, 'lxml')
    # print(soup)
    soup1 = soup.find_all(class_="describe")
    # print(soup1)
    mark = [i.get_text() for i in soup1]
    mark = mark[0][1:-1]
    # print(mark)

    # Complaint status
    soup2 = soup.find_all(class_="article-tag unsolved")
    status = re.findall('(.+)', str(soup2))
    # print(status)

    data = pd.DataFrame({
        'feedback_no': feedback_no,
        'brand': brand,
        'brand_model': brand_model,
        'feedback_question': feedback_question,
        'mark': mark,
        'feedback_time': feedback_time,
        'shop': shop,
        'status': status,
        'feedback_url': feedback_url,
    })

    # Connection string: mysql+pymysql://user:password@host:port/database?charset
    con = create_engine('mysql+pymysql://root:[email protected]:3306/python爬虫?charset=utf8')
    # Write to the database
    data.to_sql('汽车用户消费投诉多线程', con=con, if_exists='append')
    time.sleep(0.5)


# Worker for one slice of the brand list
def print_time(start, end):
    # Download every brand's URL from the database
    url_brand = download_url_for_all_brand()
    for i in range(start, end):
        brand = url_brand.loc[i][0]
        brand_url = url_brand.loc[i][1]
        print(brand, brand_url)
        get_brand_detail_url(brand, brand_url)
    print("one thread finished")


if __name__ == '__main__':
    get_url_for_all_brand()
    _thread.start_new_thread(print_time, (0, 25))
    _thread.start_new_thread(print_time, (25, 50))
    _thread.start_new_thread(print_time, (50, 75))
    _thread.start_new_thread(print_time, (75, 100))
    _thread.start_new_thread(print_time, (100, 125))
    _thread.start_new_thread(print_time, (125, 150))
    _thread.start_new_thread(print_time, (150, 175))
    _thread.start_new_thread(print_time, (175, 200))
    _thread.start_new_thread(print_time, (200, 225))
    _thread.start_new_thread(print_time, (225, 250))
    _thread.start_new_thread(print_time, (250, 275))
    _thread.start_new_thread(print_time, (275, 300))
    _thread.start_new_thread(print_time, (300, 325))
    _thread.start_new_thread(print_time, (325, 350))
    _thread.start_new_thread(print_time, (350, 375))
    _thread.start_new_thread(print_time, (375, 400))
    _thread.start_new_thread(print_time, (400, 425))
    _thread.start_new_thread(print_time, (425, 450))
    _thread.start_new_thread(print_time, (450, 475))
    _thread.start_new_thread(print_time, (475, 500))
    _thread.start_new_thread(print_time, (500, 525))
    _thread.start_new_thread(print_time, (525, 550))
    _thread.start_new_thread(print_time, (550, 557))
    # Keep the main thread alive so the worker threads can run
    while True:
        pass
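
A note on the threading here: `_thread.start_new_thread` gives no handle to wait on, which is why the script ends in a busy `while True: pass` loop. Below is a sketch of the same fan-out with the higher-level threading module (the helper name run_in_threads is an illustrative addition; the total of 557 rows comes from the ranges above); join() lets the process exit cleanly once every slice finishes.

import threading

def run_in_threads(total=557, chunk=25):
    threads = []
    # Launch one worker per 25-row slice of the brand table
    for start in range(0, total, chunk):
        t = threading.Thread(target=print_time, args=(start, min(start + chunk, total)))
        t.start()
        threads.append(t)
    # join() blocks until every worker finishes, replacing the busy-wait
    for t in threads:
        t.join()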

Simple data cleaning

# Data cleaning

import pandas as pd
import matplotlib.pyplot as plt

# Load the data
data = pd.read_excel(r'D:\Desktop\汽车用户消费投诉.xlsx')
data

# # Check whether the DataFrame contains duplicated rows
# mask = data.duplicated()
# mask
# data[~mask]

# Drop duplicate rows with the built-in DataFrame method
data = data.drop_duplicates()
data

data.to_excel(r'D:\Desktop\汽车用户消费投诉数据.xlsx')
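
If feedback_no uniquely identifies a complaint (an assumption, not verified here), deduplicating on that key alone also catches re-scraped rows whose other columns differ slightly, and index=False keeps the DataFrame index out of the Excel file:

import pandas as pd

data = pd.read_excel(r'D:\Desktop\汽车用户消费投诉.xlsx')
# Treat rows sharing a complaint number as the same complaint
data = data.drop_duplicates(subset='feedback_no')
data.to_excel(r'D:\Desktop\汽车用户消费投诉数据.xlsx', index=False)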

The 20 most-complained-about models

# The 20 most-complained-about models

import pandas as pd
import matplotlib.pyplot as plt

# Load the data
data = pd.read_excel(r'D:\Desktop\汽车用户消费投诉数据.xlsx')
# data

# Count complaints per model
result = data.groupby(by='brand_model')[['brand_model']].count()
result['数量'] = data.groupby(by='brand_model')[['brand_model']].count()
result = result.sort_values(by='数量', ascending=False)

result = result.iloc[:20]

# 1. Create the figure
plt.figure(figsize=(20,8), dpi=300)

# 2. Plot
# Bar chart of complaint counts per model
plt.bar(result.index, result['数量'], width=0.5)

# Set the y-axis tick spacing
plt.yticks(range(0, 2000, 100))

# 3. Show
plt.show()
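
One caveat with these charts: matplotlib's default font cannot render Chinese model names, so the x-axis labels come out as empty boxes. A small sketch of the usual fix (assuming the SimHei font is installed, as on most Windows machines), with rotated ticks so twenty long names stay legible:

import matplotlib.pyplot as plt

plt.rcParams['font.sans-serif'] = ['SimHei']  # a font with CJK glyphs
plt.rcParams['axes.unicode_minus'] = False    # keep the minus sign rendering

plt.figure(figsize=(20,8), dpi=300)
plt.bar(result.index, result['数量'], width=0.5)
plt.xticks(rotation=45, ha='right')           # slant long model names
plt.show()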

The 10 most-complained-about brands

# The 10 most-complained-about brands

import pandas as pd
import matplotlib.pyplot as plt

# Load the data
data = pd.read_excel(r'D:\Desktop\汽车用户消费投诉数据.xlsx')
# data

# Count complaints per brand
result = data.groupby(by='brand')[['brand']].count()
result['数量'] = data.groupby(by='brand')[['brand']].count()
result = result.sort_values(by='数量', ascending=False)

result = result.iloc[:10]

# 1. Create the figure
plt.figure(figsize=(20,8), dpi=300)

# 2. Plot
# Bar chart of complaint counts per brand
plt.bar(result.index, result['数量'], width=0.5)

# Set the y-axis tick spacing
plt.yticks(range(0, 2000, 100))

# 3. Show
plt.show()

The 10 least reliable dealers

# The 10 least reliable dealers

import pandas as pd
import matplotlib.pyplot as plt

# Load the data
data = pd.read_excel(r'D:\Desktop\汽车用户消费投诉数据.xlsx')
data

# Count complaints per dealer
result = data.groupby(by='shop')[['shop']].count()
result['数量'] = data.groupby(by='shop')[['shop']].count()
result = result.sort_values(by='数量', ascending=False)
result

# Skip the top row (likely complaints with an empty dealer field), then take ten
result = result.iloc[1:11]

# 1. Create the figure
plt.figure(figsize=(20,8), dpi=300)

# 2. Plot
# Bar chart of complaint counts per dealer
plt.bar(result.index, result['数量'], width=0.5)

# Set the y-axis tick spacing
plt.yticks(range(0, 2000, 100))

# 3. Show
plt.show()

Trend of complaints received by the site over the past year

# Trend of complaints received by the site over the past year

import pandas as pd
import matplotlib.pyplot as plt

# Load the data
data = pd.read_excel(r'D:\Desktop\汽车用户消费投诉数据.xlsx')
data

# Data processing: derive a year-month key from the complaint time
data['投诉年月'] = pd.DatetimeIndex(data['feedback_time']).strftime('%Y%m')
data.head()

# Count complaints per month; iloc[-12:-1] keeps the months before the
# latest (possibly still-incomplete) one
result = data.groupby(by='投诉年月')[['投诉年月']].count()
result['数量'] = data.groupby(by='投诉年月')[['投诉年月']].count()
result = result.iloc[-12:-1]
result

# 1. Create the figure
plt.figure(figsize=(20,8), dpi=300)

# 2. Plot

# Line chart of monthly complaint counts
plt.plot(result.index, result['数量'], color='r')

# 3. Show
plt.show()
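
An alternative sketch, assuming feedback_time parses cleanly as datetimes: resampling on a real DatetimeIndex gives the same monthly counts as the string year-month grouping above, but months with zero complaints stay on the axis instead of silently disappearing:

import pandas as pd
import matplotlib.pyplot as plt

data = pd.read_excel(r'D:\Desktop\汽车用户消费投诉数据.xlsx')
# Count rows per calendar month on a real datetime index
monthly = data.set_index(pd.to_datetime(data['feedback_time'])).resample('M').size()

plt.figure(figsize=(20,8), dpi=300)
plt.plot(monthly.index[-12:], monthly.values[-12:], color='r')
plt.show()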
