"""Scraper: fetch joke listing pages from qiushibaike.com and store each
entry (author, page number, gender, age, vote count) in a local MySQL
database via pymysql."""
import requests
from bs4 import BeautifulSoup
import pymysql

# Local MySQL connection settings.
sql_host = 'localhost'
# Database user name.
sql_user = 'root'
# Database password.
sql_password = '123456'
# Database (schema) name.
sql_name = 'test'

# Parameterized insert — %s placeholders let the driver escape scraped text
# (prevents SQL injection / quoting bugs from page content).
SQL_INSERT = """INSERT INTO user_data(author,page,sex,age,vote) VALUES(%s,%s,%s,%s,%s)"""


def download_page(http_url):
    """Fetch *http_url* and return the response body as text.

    A browser-like User-Agent is sent because the site rejects the default
    requests UA. A timeout is set so a hung server can't block forever.
    """
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0"}
    call_back = requests.get(http_url, headers=headers, timeout=10)
    return call_back.text


def get_page_content(html, page):
    """Parse one listing page and insert every entry into MySQL.

    Parameters:
        html: raw HTML of the listing page.
        page: 1-based page number, stored alongside each entry.
    """
    # Keyword arguments: positional connect() args were removed in PyMySQL 1.0.
    conn = pymysql.connect(host=sql_host, user=sql_user,
                           password=sql_password, database=sql_name)
    try:
        cursor = conn.cursor()
        soup = BeautifulSoup(html, 'html.parser')
        con = soup.find(id='content-left')
        if con is None:
            # Layout changed or the request was blocked — nothing to parse.
            return
        for item in con.find_all('div', class_='article'):
            author = item.find('h2').string
            content = item.find('div', class_='content').find('span').get_text()
            stats = item.find('div', class_='stats')
            vote = stats.find('span', class_='stats-vote').find('i', class_='number').get_text()
            author_info = item.find('div', class_='articleGender')
            if author_info is not None:
                class_list = author_info['class']
                age = author_info.string
                if 'womenIcon' in class_list:
                    gender = '女'
                elif 'manIcon' in class_list:
                    gender = '男'
                else:
                    gender = ''
            else:
                gender = ''
                age = ''
            print(author, page, gender, age, vote, content)
            # BUG FIX: the original inserted hard-coded dummy strings
            # ("name","data","gg","sd","dd"); insert the actual scraped
            # values through the parameterized statement instead.
            cursor.execute(SQL_INSERT, (author, page, gender, age, vote))
        conn.commit()
    finally:
        # The original leaked the connection (close was commented out).
        conn.close()


def main():
    """Scrape the listing pages (currently just page 1) and store entries."""
    for i in range(1, 2):
        http_url = 'https://qiushibaike.com/text/page/{}'.format(i)
        html = download_page(http_url)
        get_page_content(html, i)


if __name__ == '__main__':
    main()