将爬取到的豆瓣Top250电影数据通过sqlite3存入数据库

#爬取豆瓣top250电影,并保存到数据库
import requests
from bs4 import BeautifulSoup
import sqlite3

def get_html(web_url, timeout=10):
    """Download one page and return its ``<ol class="grid_view">`` element.

    Args:
        web_url: URL of the page to download.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot block the script forever.

    Returns:
        The first ``<ol>`` element whose class is ``grid_view`` in the parsed
        page, or ``None`` when the page contains no such element.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    user_agent = 'Mozilla/5.0 (Linux; Android 4.1.1; Nexus 7 Build/JRO03D) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.166  Safari/535.19'
    headers = {'User-agent': user_agent}
    response = requests.get(web_url, headers=headers, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of parsing an error page as if it were
    # the movie list.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    return soup.find('ol', attrs={'class': 'grid_view'})
def get_title(content):
    """Insert one ``filmtop250`` row for every ``<li>`` found in *content*.

    Each row holds, in order: the text of the ``div.pic`` element (rank), the
    first ``span.title`` (title), ``span.rating_num`` (rating), the raw text
    of the first ``<p>``, that same text with whitespace and a few stray
    characters removed, and the ``p.quote`` text (or ``'无评语'`` when the
    entry has no quote).

    The table ``filmtop250`` must already exist in ``file.sqlite3`` with six
    columns. All rows of one call are committed together, so a failure part
    way through leaves none of that call's rows in the database.

    Args:
        content: Parsed element containing the ``<li>`` movie entries.

    Returns:
        None.
    """
    # Characters deleted from the info paragraph to build the compact column;
    # a single translate() pass replaces the former chain of replace() calls.
    strip_table = str.maketrans(
        '', '', ' \n\xa0\xee\xf6\u0161\xf4\xfb\xe5\u22ef')
    sql = 'insert into filmtop250 values (?,?,?,?,?,?)'

    films = content.find_all('li')
    conn = sqlite3.connect('file.sqlite3')
    try:
        cursor = conn.cursor()
        for item in films:
            # attrs must be a dict; the former set literal {'class', 'pic'}
            # only worked because BeautifulSoup treats a non-dict value as a
            # list of candidate class names.
            rank = item.find('div', attrs={'class': 'pic'}).text
            title = item.find('span', attrs={'class': 'title'}).text
            rate = item.find('span', attrs={'class': 'rating_num'}).text
            autor = item.find('p').get_text()
            member = autor.translate(strip_table)

            quote_tag = item.find('p', attrs={'class': 'quote'})
            if quote_tag is None:
                quote = '无评语'
            else:
                quote = quote_tag.get_text().replace('\n', '').replace('\u22ef', '')

            cursor.execute(sql, (rank, title, rate, autor, member, quote))
        # One commit per page instead of one per row.
        conn.commit()
        cursor.close()
    finally:
        # Always release the database file, even if parsing or inserting fails.
        conn.close()

if __name__ == '__main__':
    # Walk the listing page by page using the `start` offsets 0, 25, ..., 225,
    # storing each page's films as soon as it has been downloaded.
    url_template = 'https://movie.douban.com/top250?start={}&filter='
    for start in range(0, 250, 25):
        page_content = get_html(url_template.format(start))
        # get_title returns nothing; it writes the rows straight to the
        # database, so there is no result to keep.
        get_title(page_content)

1.注意,sqlite3中insert into的占位符为?。

2.首先用pycharm的terminal端创建数据库file.sqlite3,并在其中创建表filmtop250(需要6列,与insert语句中的6个占位符一一对应)。

3.注意数据库关闭的位置,即cursor.close()和conn.close()的位置:二者应放在for循环结束之后,否则插入第一条记录后连接就会被关闭,后续的插入会报错。

你可能感兴趣的:(爬虫)