Getting Started with Python Web Scraping: A Case Walkthrough

Target URL: https://www.qiushibaike.com/8hr/page/ (page N of the listing is reached by appending the page number, e.g. https://www.qiushibaike.com/8hr/page/2, which is how the crawl loop at the end of the second script builds its URLs)


Data persistence source code:

"""
Version 1.1.0
Author lkk
Email [email protected]
date 2018-11-22 21:57
DESC SQLAlchemy storage layer
"""

from sqlalchemy import Column, String, create_engine, Integer, Text
from sqlalchemy.orm import sessionmaker
from sqlalchemy.ext.declarative import declarative_base

Base = declarative_base()

# Connect through the PyMySQL driver; echo=True logs every SQL statement sent.
eng = create_engine('mysql+pymysql://root:123456@localhost:3306/data', echo=True)
DBSession = sessionmaker(bind=eng)
session = DBSession()


class Joke(Base):
    """ORM model for one scraped joke."""
    __tablename__ = 'jokes'
    id = Column(Integer, primary_key=True, autoincrement=True)
    author = Column(String(20))   # poster's nickname
    content = Column(Text)        # joke text
    number = Column(String(50))   # vote/comment stats line


# Create every table derived from Base
def create_all(eng):
    Base.metadata.create_all(eng)


# Drop all tables in the database
def drop_all(eng):
    Base.metadata.drop_all(eng)


if __name__ == '__main__':
    # Running this module directly creates the tables defined above.
    create_all(eng)
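Other scripts import Joke and session from this module to write rows. A minimal sketch of that usage, assuming the file above is saved as utils2.py (the name the scraper below imports):

# Hypothetical standalone example; assumes the module above is saved as utils2.py
from utils2 import Joke, session

row = Joke(author='test_user', content='hello world', number='0 votes')
session.add(row)
session.commit()   # the INSERT is echoed because the engine was created with echo=True
session.close()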



Source code for extracting the target data:

"""
Version 1.1.0
Author lkk
Email [email protected]
date 2018-11-22 19:55
DESC Qiushibaike data collection
"""
from bs4 import BeautifulSoup
import requests
from fake_useragent import UserAgent
from utils2 import Joke, session


def get_html(url):
    # Fetch a page with a random User-Agent so requests look less uniform.
    ua = UserAgent()
    headers = {
        'User-Agent': ua.random
    }
    html = requests.get(url, headers=headers).text
    return html


def get_info(html):
    soup = BeautifulSoup(html, 'lxml')
    user_list = soup.select('h2')                                              # post authors
    content_list = soup.select('div[class="content"] > span:nth-of-type(1)')   # joke text
    joke_list = soup.select('.stats-vote')                                     # vote stats
    # next_url = soup.select('.pagination li > a')[-1].attrs['href']
    for i in range(len(user_list)):
        username = user_list[i].text.strip()
        content = content_list[i].text.strip()
        joke = joke_list[i].text.strip()
        print(username, content, joke)

        try:
            info = Joke(author=username, content=content, number=joke)  # persist the record
            session.add(info)
            session.commit()
        except BaseException as e:
            session.rollback()
            print(e)
        finally:
            session.close()


base_url = 'https://www.qiushibaike.com/8hr/page/'
# Walk listing pages 1-13 and persist every joke found on each one.
for j in range(1, 14):
    next_url = base_url + str(j)
    print(next_url)
    html = get_html(next_url)
    get_info(html)
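To confirm the crawl actually landed in MySQL, the stored rows can be read back through the same session. A minimal sketch, again assuming the model module is utils2.py:

# Hypothetical verification snippet
from utils2 import Joke, session

print('rows stored:', session.query(Joke).count())
for joke in session.query(Joke).limit(3):
    print(joke.author, '|', joke.number)
session.close()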

 
