Web Scraping Project: Scraping the Leaders' Message Board (领导留言板)

If you need to scrape the Leaders' Message Board, feel free to leave a comment so we can exchange techniques; if you need help collecting data, you can also message me privately!

  1. Project overview: the site scraped in this project is https://liuyan.people.com.cn/home

  2. General approach: first scrape the IDs of the matching messages, then build each message's URL from its ID, then scrape the relevant fields from every URL, and finally write everything to a CSV file (a brief sketch follows).
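
A minimal sketch of this pipeline, assuming the functions get_detail_urls_by_keyword, generate_urls_from_numbers, and get_information defined in the code sections below:

keyword = "老年人医疗"  # search keyword
list_url = "http://liuyan.people.com.cn/messageSearch"  # search page
base_url = "http://liuyan.people.com.cn/threads/content?tid={tid}&from=search"

message_ids = get_detail_urls_by_keyword(keyword, list_url)  # step 1: collect the message IDs
urls = generate_urls_from_numbers(message_ids, base_url)     # step 2: build the message URLs
fields = get_information(urls)                               # step 3: scrape the fields of every message
# step 4: write the nine parallel lists in fields to a CSV (see section 6)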

  3. The site's interface is shown in the figure below; if you want to run this project, take a look at that page first.
    [Figure 1: the Leaders' Message Board interface]

  4. Code: start by importing the required packages

import csv
import random
import re
import time
from selenium.common.exceptions import TimeoutException
import dateutil.parser as dparser
from random import choice
from selenium.webdriver.common.by import By
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

Next, configure the browser and randomly pick a user agent to mimic real browser behavior (this helps avoid being blocked by the site).

## Browser options (these must be created before any driver that uses them)
chrome_options = Options()
chrome_options.add_argument('blink-settings=imagesEnabled=false')  # skip image loading to speed things up

# Path to the Chrome WebDriver executable (Selenium 3-style call; each function below creates its own driver)
driver = webdriver.Chrome(executable_path='/usr/local/bin/chromedriver', options=chrome_options)

## Cut-off date: only messages posted after this date are of interest
start_date = dparser.parse('2019-06-01')


def get_time():
    '''Return a random delay in seconds (between 3 and 6)'''
    return round(random.uniform(3, 6), 1)


def get_user_agent():
    '''Return a random user-agent string'''
    user_agents = [
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
        "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
        "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
        "Mozilla/5.0 (iPod; U; CPU iPhone OS 2_1 like Mac OS X; ja-jp) AppleWebKit/525.18.1 (KHTML, like Gecko) Version/3.1.1 Mobile/5F137 Safari/525.20",
        "Mozilla/5.0 (Linux;u;Android 4.2.2;zh-cn;) AppleWebKit/534.46 (KHTML,like Gecko) Version/5.1 Mobile Safari/10600.6.3 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)",
        "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)"
    ]
    ## Randomly pick one user agent from the list to impersonate a browser
    user_agent = choice(user_agents)
    return user_agent

5. The three core functions: scrape the message IDs, build the message URLs, and scrape the message details.

def get_detail_urls_by_keyword(keyword, list_url):
    '''Search for the keyword and collect the IDs of the matching messages'''
    user_agent = get_user_agent()
    chrome_options.add_argument('user-agent=%s' % user_agent)
    drivertemp = webdriver.Chrome(options=chrome_options)
    drivertemp.maximize_window()
    drivertemp.get(list_url)

    try:
        # Wait until the search box is clickable
        search_box = WebDriverWait(drivertemp, 10).until(EC.element_to_be_clickable((By.XPATH, "/html/body/div[1]/aside/div[1]/div[2]/div/div/input")))

        # Wait until the search button is clickable
        search_button = WebDriverWait(drivertemp, 10).until(EC.element_to_be_clickable((By.XPATH, "/html/body/div[1]/aside/div[1]/div[2]/span")))

        # If both elements were located, type the keyword and trigger the search
        if search_box.is_enabled() and search_button.is_enabled():
            search_box.send_keys(keyword)
            search_button.click()
    except Exception as e:
        print("Element lookup failed or an exception occurred:", e)

    # (One could also simulate pressing Enter in the search box here)
    time.sleep(2)  # wait for the search-results page to load

    # Keep clicking the "load more" button until every results page has been loaded
    message_ids = []
    page = 1
    while True:
        try:
            # (The result links could also be collected here, e.g. with find_elements_by_tag_name('a');
            #  this script instead extracts the message IDs once all pages are loaded.)
            next_page_button = WebDriverWait(drivertemp, 30).until(
                EC.element_to_be_clickable((By.CLASS_NAME, "mordList")))

            if next_page_button.is_enabled():
                next_page_button.click()
                page += 1
            else:
                break
        except TimeoutException:
            # The "load more" button can no longer be found; the TimeoutException ends the loop
            break
        time.sleep(3)

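    # After every results page has been loaded, collect the ID span of each message and keep the part after ':'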
    message_elements = drivertemp.find_elements_by_xpath('//div[@class="headMainS fl"]//span[@class="t-mr1 t-ml1"]')
    for element in message_elements:
        message_id = element.text.strip().split(':')[-1]
        message_ids.append(message_id)

    drivertemp.quit()
    return message_ids

def get_information(urls):
    '''Visit each message URL and scrape its detail fields'''
    m_user = []  # message recipient (the leader / department the message is addressed to)
    m_subject = []  # subject
    m_type = []  # type
    m_star = []  # number of favorites
    m_domain = []  # domain
    m_state = []  # handling status
    m_time = []  # time
    m_message = []  # message body
    m_reply = []  # official reply
    for url in urls:
        # Launch a fresh browser with a random user agent for each message page
        user_agent = get_user_agent()
        chrome_options.add_argument('user-agent=%s' % user_agent)
        drivertemp = webdriver.Chrome(options=chrome_options)
        drivertemp.maximize_window()
        drivertemp.get(url)
#        time.sleep(8)

        max_retries = 3  # retry a few times in case the page fails to load
        for _ in range(max_retries):
            try:

                user_e = drivertemp.find_elements_by_xpath('//div[@class="replyObject fl"]//span[@style="cursor: pointer;"]')
                for user_element in user_e:
                    user = user_element.text.strip().split(':')[-1]
                    m_user.append(user)

                subject = drivertemp.find_elements_by_xpath('//div[@class="replyInfoHead clearfix"]//h1[@class="fl"]')
                for element in subject:
                    m_subject.append(element.text)

                type = drivertemp.find_elements_by_xpath('//p[@class="typeNameD"]')
                for element in type:
                    m_type.append(element.text)

                domain = drivertemp.find_elements_by_xpath('//p[@class="domainName"]')
                for element in domain:
                    m_domain.append(element.text)

                state = drivertemp.find_elements_by_xpath('//p[@class="stateInfo"]')
                for element in state:
                    m_state.append(element.text)

                star_elements = drivertemp.find_elements_by_xpath('//div[@class="replyInfoHeadIcon fr clearfix"]')
                for star_element in star_elements:
                    favorite_element = star_element.find_element_by_xpath('.//p[@title="收藏"]')
                    favorite_text = favorite_element.text.strip()
                    # Extract the numeric part (the number of favorites) with a regex
                    favorite_count = re.search(r'\d+', favorite_text)
                    if favorite_count:
                        m_star.append(favorite_count.group())

                time_elements = drivertemp.find_elements_by_xpath('//ul[@class="replyInfoName clearfix"]')
                for time_element in time_elements:
                    time_element = time_element.find_element_by_xpath(
                        './/span[contains(text(), "留言ID:")]/following-sibling::span')
                    reply_time = time_element.text.strip()
                    m_time.append(reply_time)

                message_elements = drivertemp.find_elements_by_xpath(
                    '//div[@class="clearfix replyContent"]//p[@id="replyContentMain"]')
                for message_element in message_elements:
                    message_text = message_element.text.strip()
                    m_message.append(message_text)

                reply_elements = drivertemp.find_elements_by_xpath('//div[@class="replyHandleMain fl"]//p[@class="handleContent noWrap sitText"]')
                for reply_element in reply_elements:
                    reply_text = reply_element.text.strip()
                    m_reply.append(reply_text)

                break  # all fields scraped successfully, no need to retry

            except Exception as e:
                print(f"An error occurred: {str(e)}")
                # The page failed to load: refresh and try again
                drivertemp.refresh()
                time.sleep(5)  # wait 5 seconds before the next attempt (adjust as needed)

        drivertemp.quit()  # close this browser before moving on to the next URL
    return m_user,m_subject,m_type,m_star,m_domain,m_state,m_time,m_message,m_reply

def generate_urls_from_numbers(number_list, base_url):
    """
    Build the list of message URLs by substituting each ID into the base URL.

    :param number_list: list of message IDs to substitute
    :param base_url: base URL string in which the number after tid= is replaced
    :return: list of URLs
    """
    url_list = []
    for number in number_list:
        # Insert the ID into the URL with str.format
        url = base_url.format(tid=number)
        url_list.append(url)
    return url_list

6. Calling the functions: when running the script, replace keyword and the CSV filename with your own, depending on the information you want to scrape.

# Example of a single message URL: http://liuyan.people.com.cn/threads/content?tid=14505592&from=search
list_url = "http://liuyan.people.com.cn/messageSearch"
keyword = "老年人医疗"  # replace with the keyword you want to search for
detail_links = get_detail_urls_by_keyword(keyword, list_url)
print(detail_links)
#for link in detail_links:
    #print(link)

base_url = "http://liuyan.people.com.cn/threads/content?tid={tid}&from=search"
urls = generate_urls_from_numbers(detail_links, base_url)
for url in urls:
    print(url)

m_user, m_subject, m_type, m_star, m_domain, m_state, m_time, m_message, m_reply = get_information(urls)




# Create a new CSV file or open an existing one for writing
# (utf-8-sig is assumed here so the Chinese text also opens correctly in Excel)
with open('老年人医疗.csv', 'w', newline='', encoding='utf-8-sig') as csv_file:
    csv_writer = csv.writer(csv_file)

    # Write the header row
    csv_writer.writerow(['User', 'Subject', 'Type', 'Star', 'Domain', 'State', 'Time', 'Message', 'Reply'])

    # Get the maximum length among all lists
    max_len = max(len(m_user), len(m_subject), len(m_type), len(m_star), len(m_domain), len(m_state), len(m_time), len(m_message), len(m_reply))

    # Ensure all lists have the same length by padding with empty strings
    m_user.extend([''] * (max_len - len(m_user)))
    m_subject.extend([''] * (max_len - len(m_subject)))
    m_type.extend([''] * (max_len - len(m_type)))
    m_star.extend([''] * (max_len - len(m_star)))
    m_domain.extend([''] * (max_len - len(m_domain)))
    m_state.extend([''] * (max_len - len(m_state)))
    m_time.extend([''] * (max_len - len(m_time)))
    m_message.extend([''] * (max_len - len(m_message)))
    m_reply.extend([''] * (max_len - len(m_reply)))

    # Transpose the data to write column-wise
    data_to_write = [
        m_user,
        m_subject,
        m_type,
        m_star,
        m_domain,
        m_state,
        m_time,
        m_message,
        m_reply
    ]

    data_to_write = list(zip(*data_to_write))

    # Write the data to the CSV file
    csv_writer.writerows(data_to_write)

print("Data has been written to the CSV file 'example.csv'")

7. Results:
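
For a quick look at the scraped data, here is a minimal sketch (it assumes pandas is installed and is not part of the scraper itself):

import pandas as pd

df = pd.read_csv('老年人医疗.csv', encoding='utf-8-sig')  # the CSV written in section 6
print(df.shape)   # number of scraped records and columns
print(df.head())  # preview the first few rows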

8. If you have any questions, feel free to comment or message me privately!
