Python Web Scraping: Hupu 步行街 (BXJ) Forum Posts

Preface

The rise of Python web scraping means data is no longer so hard to come by: we can go out and collect exactly the data we need. In this post we will write a scraper for the basic information of Hupu 步行街 (BXJ) forum threads, namely: post title (title), post link (post_link), author (author), author page (author_page), creation time (start_data), reply count (reply), view count (view), last replying user (last_reply), and last reply time (date_time). We will then save the collected information into a database so it is ready for later use.

  1. Locating the post information we need:
    First, open the Hupu BXJ board (https://bbs.hupu.com/bxj) in a browser, right-click and choose "Inspect Element"; in the developer tools we can see the elements we need to target:
    (Figure 1: the post list elements as shown in the browser's developer tools)
  2. Fetching the page HTML and parsing out the post data list:
import requests
from bs4 import BeautifulSoup
import datetime
import time

# Fetch a page and return it as a parsed BeautifulSoup object
def get_page(link):
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:59.0) Gecko/20100101 Firefox/59.0'}
    r = requests.get(link, headers=headers)
    html = r.content.decode('UTF-8')
    soup = BeautifulSoup(html, 'lxml')
    return soup

# Parse the returned HTML and collect each post's fields into a list
def get_data(post_list):
    data_list = []
    for post in post_list:
        # post title and link
        title_div = post.find('div', class_='titlelink box')
        title = title_div.a.text.strip()
        post_link = title_div.a['href']
        post_link = 'https://bbs.hupu.com' + post_link
        # author, author page, and creation time
        author_div = post.find('div', class_='author box')
        author = author_div.find('a', class_='aulink').text.strip()
        author_page = author_div.find('a', class_='aulink')['href']
        start_data = author_div.select('a:nth-of-type(2)')[0].get_text()
        # reply count and view count, shown as e.g. "12 / 3456"
        reply_view = post.find('span', class_='ansour box').text.strip()
        reply = reply_view.split('/')[0].strip()
        view = reply_view.split('/')[1].strip()
        # last replying user and last reply time
        reply_div = post.find('div', class_='endreply box')
        reply_time = reply_div.a.text.strip()
        last_reply = reply_div.find('span', class_='endauthor').text.strip()
        if ':' in reply_time:
            # a time like "13:05" means the reply was made today
            date_time = str(datetime.date.today()) + ' ' + reply_time
            date_time = datetime.datetime.strptime(date_time, '%Y-%m-%d %H:%M')
        else:
            # otherwise the board shows a plain date like "2018-04-01"
            date_time = datetime.datetime.strptime(reply_time, '%Y-%m-%d').date()
        data_list.append([title, post_link, author, author_page, start_data,
                          reply, view, last_reply, date_time])
    return data_list
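
To make the selectors concrete, here is a minimal sketch that runs get_data over a hand-written <li> fragment mirroring the class names the parser expects. The fragment and all its values are hypothetical, and the live markup on bbs.hupu.com may well have changed since this was written:

# Hypothetical fragment mirroring the structure the selectors assume
sample_html = '''
<li>
  <div class="titlelink box"><a href="/12345.html">Sample post title</a></div>
  <div class="author box">
    <a class="aulink" href="https://my.hupu.com/123">some_user</a>
    <a href="https://bbs.hupu.com/bxj-postdate">2018-04-01</a>
  </div>
  <span class="ansour box">12 / 3456</span>
  <div class="endreply box">
    <a href="/12345-2.html">13:05</a>
    <span class="endauthor">another_user</span>
  </div>
</li>
'''
sample_soup = BeautifulSoup(sample_html, 'lxml')
print(get_data(sample_soup.find_all('li')))
# one row: title, link, author, author page, creation date,
# reply count, view count, last replier, last reply datetime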

for i in range(1, 3):
    link = 'https://bbs.hupu.com/bxj-' + str(i)
    print('Scraping page %s...' % i)
    soup = get_page(link)
    soup = soup.find('div', class_='show-list')
    post_list = soup.find_all('li')
    data_list = get_data(post_list)
    for each in data_list:
        print(each)
    print('Page %s done!' % i)
    time.sleep(3)

This snippet loops over the first pages, runs the two functions above, and prints each parsed row to the console so we can verify the data before storing it.
3. Storing the data in MySQL
First we wrap the database handling in a class. Opening the connection requires the database host, username, password, database name, and charset; then we write the INSERT statement and pass in its parameters.
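
Before running the inserts, the scraping database needs a hupu table to write into. A minimal setup sketch follows; the column names match the INSERT below, but the types and lengths are assumptions, so adjust them to your needs:

# One-off setup: create the `hupu` table the scraper writes into.
# Column types and lengths are assumptions, adjust as needed.
import MySQLdb

conn = MySQLdb.connect(host='localhost', user='root', password='mysql',
                       db='scraping', charset='utf8')
cur = conn.cursor()
cur.execute("""
    CREATE TABLE IF NOT EXISTS hupu (
        id          INT AUTO_INCREMENT PRIMARY KEY,
        title       VARCHAR(255),
        post_link   VARCHAR(255),
        author      VARCHAR(100),
        author_page VARCHAR(255),
        start_data  VARCHAR(20),
        reply       INT,
        `view`      INT,
        last_reply  VARCHAR(100),
        date_time   DATETIME
    ) DEFAULT CHARSET=utf8
""")
conn.commit()
conn.close()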

import MySQLdb

class MysqlAPI(object):
    def __init__(self, db_host, db_user, db_password, db_name, db_charset):
        self.db_host = db_host
        self.db_user = db_user
        self.db_password = db_password
        self.db_name = db_name
        self.db_charset = db_charset
        self.conn = MySQLdb.connect(host=self.db_host, user=self.db_user,
                                    password=self.db_password, db=self.db_name,
                                    charset=self.db_charset)
        self.cur = self.conn.cursor()

    def add(self, title, post_link, author, author_page, start_data,
            reply, view, last_reply, date_time):
        # Parameterized placeholders: the driver quotes and escapes the
        # values, so a title containing quotes cannot break the statement
        sql = ("INSERT INTO hupu (title, post_link, author, author_page, "
               "start_data, reply, view, last_reply, date_time) "
               "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)")
        self.cur.execute(sql, (title, post_link, author, author_page,
                               start_data, reply, view, last_reply, date_time))
        # commit() is required, otherwise the rows are never persisted
        self.conn.commit()

*Note: after executing the SQL statement you must call self.conn.commit(), otherwise the data never actually gets saved; this problem troubled me for a long time. Hand-written SQL is also easy to get wrong, so instead of formatting quoted values into the string yourself, pass the values as a separate tuple to cursor.execute() and let the driver handle the escaping; this avoids a whole class of unnecessary errors.
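
To see the difference concretely (values hypothetical), compare a hand-formatted statement against the parameterized form when a title contains a single quote:

# A hypothetical title containing a single quote
title = "What's your take, JRs?"

# Hand-formatted: the embedded quote closes the SQL string literal early,
# so the statement is invalid (and, in general, injectable)
broken = "INSERT INTO hupu (title) VALUES ('%s')" % title
print(broken)  # INSERT INTO hupu (title) VALUES ('What's your take, JRs?')

# Parameterized: the driver escapes the value before it reaches MySQL
# cur.execute("INSERT INTO hupu (title) VALUES (%s)", (title,))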


Appendix: the complete script

import requests
from bs4 import BeautifulSoup
import datetime
import MySQLdb
import time

# Fetch a page and return it as a parsed BeautifulSoup object
def get_page(link):
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:59.0) Gecko/20100101 Firefox/59.0'}
    r = requests.get(link, headers=headers)
    html = r.content.decode('UTF-8')
    soup = BeautifulSoup(html, 'lxml')
    return soup

# Parse the returned HTML and collect each post's fields into a list
def get_data(post_list):
    data_list = []
    for post in post_list:
        # post title and link
        title_div = post.find('div', class_='titlelink box')
        title = title_div.a.text.strip()
        post_link = title_div.a['href']
        post_link = 'https://bbs.hupu.com' + post_link
        # author, author page, and creation time
        author_div = post.find('div', class_='author box')
        author = author_div.find('a', class_='aulink').text.strip()
        author_page = author_div.find('a', class_='aulink')['href']
        start_data = author_div.select('a:nth-of-type(2)')[0].get_text()
        # reply count and view count, shown as e.g. "12 / 3456"
        reply_view = post.find('span', class_='ansour box').text.strip()
        reply = reply_view.split('/')[0].strip()
        view = reply_view.split('/')[1].strip()
        # last replying user and last reply time
        reply_div = post.find('div', class_='endreply box')
        reply_time = reply_div.a.text.strip()
        last_reply = reply_div.find('span', class_='endauthor').text.strip()
        if ':' in reply_time:
            # a time like "13:05" means the reply was made today
            date_time = str(datetime.date.today()) + ' ' + reply_time
            date_time = datetime.datetime.strptime(date_time, '%Y-%m-%d %H:%M')
        else:
            # otherwise the board shows a plain date like "2018-04-01"
            date_time = datetime.datetime.strptime(reply_time, '%Y-%m-%d').date()
        data_list.append([title, post_link, author, author_page, start_data,
                          reply, view, last_reply, date_time])
    return data_list

class MysqlAPI(object):
    def __init__(self, db_host, db_user, db_password, db_name, db_charset):
        self.db_host = db_host
        self.db_user = db_user
        self.db_password = db_password
        self.db_name = db_name
        self.db_charset = db_charset
        self.conn = MySQLdb.connect(host=self.db_host, user=self.db_user,
                                    password=self.db_password, db=self.db_name,
                                    charset=self.db_charset)
        self.cur = self.conn.cursor()

    def add(self, title, post_link, author, author_page, start_data,
            reply, view, last_reply, date_time):
        # Parameterized placeholders: the driver quotes and escapes the
        # values, so a title containing quotes cannot break the statement
        sql = ("INSERT INTO hupu (title, post_link, author, author_page, "
               "start_data, reply, view, last_reply, date_time) "
               "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)")
        self.cur.execute(sql, (title, post_link, author, author_page,
                               start_data, reply, view, last_reply, date_time))
        # commit() is required, otherwise the rows are never persisted
        self.conn.commit()

# open one database connection and reuse it across pages
hupu_post = MysqlAPI('localhost', 'root', 'mysql', 'scraping', 'utf8')
for i in range(1, 3):
    link = 'https://bbs.hupu.com/bxj-' + str(i)
    print('Scraping page %s...' % i)
    soup = get_page(link)
    soup = soup.find('div', class_='show-list')
    post_list = soup.find_all('li')
    data_list = get_data(post_list)
    for each in data_list:
        hupu_post.add(*each)
    print('Page %s done!' % i)
    time.sleep(3)
