开发环境为 Python3.6 ,爬虫项目全部内容索引目录
看懂Python爬虫框架,所见即所得一切皆有可能
本文介绍如何以最简单的操作流程保存Python网络爬虫抓取的数据,并通过2个完整的代码示例(分别写入 MySQL 和 MongoDB)来说明爬虫抓取的数据是如何保存到数据库中的。
虽说不会敲代码的 Python数据分析师 不是好的数据分析师,但你不是正儿八经的开发人员,代码敲得那么溜有什么用?学点数据爬虫基础,能让繁琐的数据CV工作(Ctrl+C,Ctrl+V)实现自动化就足够了。
#coding=utf-8
import urllib
import urllib.request
import pymysql
import time
import requests
import datetime
import pandas as pd
from bs4 import BeautifulSoup
# Keyword arguments passed to pymysql.connect() by the functions in this
# script that work inside the ``sampledb`` database.
mysql_engine = dict(
    host="localhost",
    database="sampledb",
    user="root",
    password="admin",
    charset="utf8",
)
# Create the database
def creat_database_sampledb():
    """Create the ``sampledb`` database on the local MySQL server if missing.

    The connection is opened without selecting a database, the
    ``Create Database If Not Exists`` statement is executed and committed,
    and the connection is closed whether or not the statement succeeded.
    """
    server_settings = {"host": "localhost", "user": "root", "password": "admin"}
    create_db_statement = "Create Database If Not Exists sampledb CHARSET=utf8"
    connection = pymysql.connect(**server_settings)  # open the connection
    try:
        # The cursor is closed automatically when the ``with`` block exits.
        with connection.cursor() as cur:
            cur.execute(create_db_statement)
        connection.commit()
    finally:
        connection.close()
# Create the news list table
def createNewsTable():
    """Create the ``News`` table if it does not exist yet.

    The table has an auto-increment integer primary key ``id`` plus
    ``title`` (varchar(100)), ``url`` (varchar(100)) and ``date`` (date).

    Returns:
        None.

    Raises:
        Whatever ``pymysql`` raises when connecting or executing the
        statement; the connection is closed before the error propagates.
    """
    createTbSql = (
        "Create Table If Not Exists News( "
        "id int primary key auto_increment, "
        "title varchar(100), "
        "url varchar(100), "
        "date date)")
    # Connect *before* entering ``try``: if connect() itself fails there is
    # nothing to close, and the ``finally`` clause must not mask the real
    # error with an UnboundLocalError on the connection variable.
    conn = pymysql.connect(**mysql_engine)
    try:
        with conn.cursor() as cursor:
            cursor.execute(createTbSql)
        conn.commit()
    finally:
        conn.close()
    return None
# Create the news body table
def createNewsBody():
    """Create the ``NewsBody`` table if it does not exist yet.

    ``NewsBody.id`` is both the primary key and a foreign key referencing
    ``News(id)``; ``text`` holds the article body.

    Returns:
        None.

    Raises:
        Whatever ``pymysql`` raises when connecting or executing the
        statement; the connection is closed before the error propagates.
    """
    createTbNews = (
        "Create Table If Not Exists NewsBody( "
        "id INT PRIMARY KEY, "
        "text text, "
        "FOREIGN KEY(id) REFERENCES News(id))")
    # Connect *before* entering ``try``: if connect() itself fails there is
    # nothing to close, and the ``finally`` clause must not mask the real
    # error with an UnboundLocalError on the connection variable.
    conn = pymysql.connect(**mysql_engine)
    try:
        with conn.cursor() as cursor:
            cursor.execute(createTbNews)
        conn.commit()
    finally:
        conn.close()
    return None
# Start crawling
def start_crawler(max_pages=2, timeout=30):
    """Crawl the news list pages and store the new headlines in MySQL.

    Pages ``list-107-1.html`` up to ``list-107-<max_pages>.html`` are
    fetched in order. Each page is decoded as GBK, parsed with
    ``get_page_news`` and the resulting rows are written with ``to_mysql``.
    Crawling stops early at the first page that yields no rows.

    Args:
        max_pages: Highest page number to fetch (default 2, the value that
            used to be hard-coded).
        timeout: Seconds to wait for the HTTP response before
            ``urlopen`` gives up.
    """
    user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
    headers = {'User-Agent': user_agent}
    for page_num in range(1, max_pages + 1):
        url = "http://www.cctd.com.cn/list-107-{}.html".format(page_num)
        print(url)
        req = urllib.request.Request(url, headers=headers)
        # ``with`` closes the HTTP response (and its socket) as soon as the
        # body has been read instead of leaving it to the garbage collector.
        with urllib.request.urlopen(req, timeout=timeout) as response:
            content = response.read().decode('gbk')
        one_page = get_page_news(content)
        time.sleep(1)  # be polite to the server between requests
        if not one_page:
            break
        to_mysql(one_page)
        time.sleep(1)
    print('新闻抓取完毕')
# Write to the database
def to_mysql(one_page):
    """Print one page of scraped rows and bulk-insert them into ``News``.

    Args:
        one_page: Sequence of 3-item rows supplying, in order, the
            ``title``, ``url`` and ``date`` placeholders of the insert
            statement. ``id`` is passed as NULL in the statement itself.
    """
    print(one_page)
    insert_statement = "insert into News(id,title,url,date) values(Null,%s,%s,%s)"
    connection = pymysql.connect(**mysql_engine)
    try:
        with connection.cursor() as cur:
            cur.executemany(insert_statement, one_page)
        connection.commit()
    finally:
        connection.close()
def get_page_news(content):
    """Extract the headlines of one list page that are not stored yet.

    Args:
        content: HTML text of a news list page.

    Returns:
        A list of ``(title, url, date)`` tuples, one per table row whose
        link is not already reported by ``get_exist`` for that row's date.
        The list is empty when the page has no ``<table id="new_table">``.
    """
    soup = BeautifulSoup(content, 'lxml')
    one_page = []
    lptable = soup.find('table', id='new_table')
    if lptable is None:
        # Unexpected layout or error page: nothing to extract.
        return one_page
    # Cache the stored URLs per date so the database is queried once per
    # distinct date instead of once per table row.
    known_urls_by_date = {}
    for row in lptable.find_all('tr'):
        link = row.a
        date_cell = row.find('td', width='20%')
        # Skip rows (e.g. headers or spacers) lacking a link or date cell.
        if link is None or date_cell is None or not link.has_attr('href'):
            continue
        href = link['href']
        date = date_cell.string
        if date not in known_urls_by_date:
            known_urls_by_date[date] = get_exist(date)
        known_urls = known_urls_by_date[date]
        if href in known_urls:
            continue
        # Remember the link so a repeat on the same page is not queued twice.
        known_urls.add(href)
        one_page.append((link.string, href, date))
    return one_page
def get_exist(date):
    """Return the set of URLs already stored in ``News`` for ``date``.

    Args:
        date: Date value as scraped from the list page; it is sent to the
            server as a bound query parameter.

    Returns:
        A ``set`` of the ``url`` column values (empty if no row matches).
    """
    # Bound parameter instead of str.format(): the value comes from a
    # scraped web page and must never be spliced into the SQL text.
    sql = "select url from News where date=%s"
    conn = pymysql.connect(**mysql_engine)
    try:
        with conn.cursor() as cursor:
            cursor.execute(sql, (date,))
            result = cursor.fetchall()
    finally:
        conn.close()
    # Each fetched row is a 1-tuple holding the url.
    return {row[0] for row in result}
# Fetch the article bodies
def get_new_body():
    """Download and store the body of every news item that lacks one.

    The ``(id, url)`` pairs come from ``get_news_linksfrom_database``; each
    body is fetched with ``get_news_text`` and saved with
    ``writer_news_body_to_database``.
    """
    for news_id, news_url in get_news_linksfrom_database():
        body_text = get_news_text(news_url)
        # Write to the database
        writer_news_body_to_database(news_id, body_text)
    print("新闻主体完毕!")
def get_news_linksfrom_database():
    """Return ``(id, url)`` for every ``News`` row without a ``NewsBody`` row.

    Returns:
        The fetched rows, or an empty list when there are none.
    """
    # The table is created as ``NewsBody``; MySQL compares table names
    # case-sensitively on hosts with a case-sensitive file system, so the
    # query must use exactly the same spelling as the DDL.
    pymysql_select_newslink = """
    select News.id,News.url from News left join NewsBody
    on News.id = NewsBody.id where NewsBody.id is null;
    """
    conn = pymysql.connect(**mysql_engine)
    try:
        with conn.cursor() as cursor:
            cursor.execute(pymysql_select_newslink)
            result = cursor.fetchall()
    finally:
        conn.close()
    return result if result else []
def get_news_text(url, timeout=30):
    """Download an article page and return the text of its ``Zoom`` div.

    Args:
        url: Address of the article page.
        timeout: Seconds ``requests`` waits for the server before raising,
            so one stalled connection cannot hang the crawl.

    Returns:
        The text content of ``<div id="Zoom">``, or ``None`` when the page
        has no such element.

    Raises:
        requests.exceptions.RequestException: if the download fails.
    """
    html = requests.get(url, timeout=timeout)
    # Use the encoding detected from the body rather than the header guess.
    html.encoding = html.apparent_encoding
    soup = BeautifulSoup(html.text, 'html.parser')
    zoom = soup.find('div', {'id': 'Zoom'})
    # Explicit check instead of a bare ``except:`` — only the "element not
    # found" case is expected here; anything else should surface.
    if zoom is None:
        return None
    return zoom.text
def writer_news_body_to_database(id, news_body):
    """Insert one article body into the ``NewsBody`` table.

    Args:
        id: Primary key of the corresponding ``News`` row.
        news_body: Article text to store in the ``text`` column.
    """
    print("INFO: Writing News ID:{} To Database...".format(id))
    # The table is created as ``NewsBody``; MySQL compares table names
    # case-sensitively on hosts with a case-sensitive file system, so the
    # statement must use exactly the same spelling as the DDL.
    pymysql_writer_to_table = """
    insert into NewsBody(id,text) values(%s,%s)
    """
    conn = pymysql.connect(**mysql_engine)
    try:
        with conn.cursor() as cursor:
            cursor.execute(pymysql_writer_to_table, (id, news_body))
        conn.commit()
    finally:
        conn.close()
if __name__ == '__main__':
    # Set up the schema: database first, then the two tables.
    creat_database_sampledb()
    createNewsTable()
    createNewsBody()

    # Stage 1: crawl the news list (title, url, date).
    '''爬取新闻简要'''
    start_crawler()

    # Stage 2: crawl the body of each stored news item.
    '''爬取具体新闻'''
    get_new_body()
#coding=utf-8
import urllib
import urllib.request
import time
import requests
import pandas as pd
from bs4 import BeautifulSoup
import pymongo
# Obtain the MongoClient object for the local server
client = pymongo.MongoClient(host="localhost", port=27017)
# Select the database used by this script
db = client["news"]
# Start crawling
def start_crawler(max_pages=1, timeout=30):
    """Crawl the news list pages and hand each page to ``get_page_news``.

    Pages ``list-13-1.html`` up to ``list-13-<max_pages>.html`` are fetched
    in order; the rows returned by ``get_page_news`` are passed to
    ``to_mysql``. Crawling stops early at the first page that yields no rows.

    Args:
        max_pages: Highest page number to fetch (default 1, the value that
            used to be hard-coded).
        timeout: Seconds ``requests`` waits for the server before raising.
    """
    user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
    headers = {'User-Agent': user_agent}
    for page_num in range(1, max_pages + 1):
        url = "http://www.cctd.com.cn/list-13-{}.html".format(page_num)
        print(url)
        response = requests.get(url, headers=headers, timeout=timeout)
        # Decode with the encoding detected from the body, as
        # get_news_text does, instead of trusting the header-based guess.
        response.encoding = response.apparent_encoding
        content = response.text
        one_page = get_page_news(content)
        time.sleep(1)  # be polite to the server between requests
        if not one_page:
            break
        to_mysql(one_page)
        time.sleep(1)
    print('新闻标题抓取完毕')
# Show the scraped title / url / date rows
def to_mysql(one_page):
    """Print one page of scraped rows.

    Despite its name this function performs no database write; it only
    echoes ``one_page`` to standard output.
    """
    print(one_page)
def get_page_news(content):
    """Parse one list page and store each headline in ``db.news``.

    Args:
        content: HTML text of a news list page.

    Returns:
        A list of ``(title, url, date)`` tuples, one per table row that has
        both a link and a date cell. Empty when the page has no
        ``<table id="new_table">``.
    """
    soup = BeautifulSoup(content, 'lxml')
    one_page_list = []
    lptable = soup.find('table', id='new_table')
    if lptable is None:
        # Unexpected layout or error page: nothing to extract.
        return one_page_list
    for row in lptable.find_all('tr'):
        link = row.a
        date_cell = row.find('td', width='20%')
        # Skip rows (e.g. headers or spacers) lacking a link or date cell.
        if link is None or date_cell is None or not link.has_attr('href'):
            continue
        title = link.string
        url = link['href']
        date = date_cell.string
        one_page = {
            'title': title,
            'date': date,
            'type': 'analysis',
            'label': 'www.cctd.com.cn',
        }
        # Upsert keyed on the url: a new headline is inserted (the url
        # field comes from the filter), while re-running the crawler leaves
        # an existing document — and any body already stored on it —
        # untouched instead of piling up duplicates.
        db.news.update_one({'url': url}, {'$setOnInsert': one_page}, upsert=True)
        one_page_list.append((title, url, date))
    return one_page_list
# Fetch the article bodies
def get_new_body():
    """Download every stored article and save its body on the document(s).

    URLs come from ``get_news_linksfrom_database``; each body is fetched
    with ``get_news_text`` and written to the ``newsbody`` field.
    """
    link_list = get_news_linksfrom_database()
    # Fetch each distinct url once even if several documents share it.
    for url in link_list['url'].drop_duplicates():
        news_body = get_news_text(url)
        # Write to the database. ``Collection.update`` is deprecated in
        # PyMongo 3 and removed in PyMongo 4; ``update_many`` also covers
        # every document that carries this url.
        db.news.update_many({'url': url}, {"$set": {'newsbody': news_body}})
    print("新闻主体爬取完毕!")
def get_news_linksfrom_database():
    """Load the url and _id of the stored analysis news into a DataFrame.

    Returns:
        A ``pandas.DataFrame`` with columns ``url`` and ``_id`` built from
        the ``db.news`` documents whose ``label`` is ``www.cctd.com.cn`` and
        whose ``type`` is ``analysis``.
    """
    selector = {'label': 'www.cctd.com.cn', 'type': 'analysis'}
    wanted_fields = {'url': 1, '_id': 1}
    documents = list(db.news.find(selector, wanted_fields))
    return pd.DataFrame(documents, columns=['url', '_id'])
def get_news_text(url, timeout=30):
    """Download an article page and return the HTML of its ``Zoom`` div.

    Args:
        url: Address of the article page.
        timeout: Seconds ``requests`` waits for the server before raising,
            so one stalled connection cannot hang the crawl.

    Returns:
        The ``<div id="Zoom">`` element serialized with ``str()``, or
        ``None`` when the page has no such element.

    Raises:
        requests.exceptions.RequestException: if the download fails.
    """
    html = requests.get(url, timeout=timeout)
    # Use the encoding detected from the body rather than the header guess.
    html.encoding = html.apparent_encoding
    soup = BeautifulSoup(html.text, 'html.parser')
    zoom = soup.find('div', {'id': 'Zoom'})
    # ``str(None)`` never raises, so the former ``except`` branch could not
    # run and a missing div was returned as the literal text "None".
    # Test for the missing element explicitly instead.
    if zoom is None:
        print("error")
        return None
    print("记录爬取完毕")
    return str(zoom)
if __name__ == '__main__':
    # Stage 1: crawl the news list (title, url, date).
    '''爬取新闻简要'''
    start_crawler()

    # Stage 2: crawl the body of each stored news item.
    '''爬取具体新闻'''
    get_new_body()