There are two implementation approaches:
1. Use the open-source pyspider framework. After installing and starting pyspider, its web UI runs locally (on port 5000 by default); create a new project there and use the Python code below to crawl Zhihu questions and comments, storing the results via MySQL-python (the MySQLdb driver) into a database you set up yourself, so the data is available for your own analysis later.
2. Use libraries such as urllib, PyQuery, requests, and BeautifulSoup to implement a simple crawler yourself, which can download images or crawl text and save it to disk or a database.
This article has four parts (all code targets Python 2):
Part 1: the simplest possible crawler, built on requests and BeautifulSoup;
Part 2: a Taobao image crawler built on urllib2/urllib;
Part 3: a pyspider project that crawls Zhihu questions and answers into MySQL;
Part 4: a pyspider project that crawls V2EX topics into MySQL.
Part 1: the simplest crawler, fetching the qiushibaike.com front page and printing the text of every content block.

import requests
from bs4 import BeautifulSoup

def most_simple_crawl():
    # the simplest possible crawler: download one page and parse it with BeautifulSoup
    content = requests.get('http://www.qiushibaike.com').content
    soup = BeautifulSoup(content, 'html.parser')
    # print the text of every <div class="content"> on the page
    for div in soup.find_all('div', {'class': 'content'}):
        print div.text.strip()

if __name__ == '__main__':
    most_simple_crawl()
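If the snippet above comes back empty, the site may be rejecting the default python-requests User-Agent; passing a browser-like header to requests.get usually helps. A minimal sketch, reusing the Firefox User-Agent string from the Taobao example below:

import requests
from bs4 import BeautifulSoup

def crawl_with_headers():
    # pretend to be a desktop browser instead of the default python-requests client
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:49.0) Gecko/20100101 Firefox/49.0'}
    content = requests.get('http://www.qiushibaike.com', headers=headers).content
    soup = BeautifulSoup(content, 'html.parser')
    for div in soup.find_all('div', {'class': 'content'}):
        print div.text.strip()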
Part 2: a Taobao image crawler that searches for a keyword, walks the result pages, extracts image URLs with a regular expression, and saves the pictures to disk.

# coding=utf-8
import re
import urllib2
import urllib

def crawl_taobao():
    # keyword to search for on Taobao ("比基尼" = bikini)
    key = "比基尼"
    key = urllib2.quote(key)
    # send a browser-like User-Agent with every request
    headers = ("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:49.0) Gecko/20100101 Firefox/49.0")
    opener = urllib2.build_opener()
    opener.addheaders = [headers]
    urllib2.install_opener(opener)
    # page through the results; Taobao's search offsets each page by 44 items via the "s" parameter
    for i in range(0, 4):
        url = "https://s.taobao.com/search?q=" + key + "&s=" + str(i * 44)
        data = urllib2.urlopen(url).read().decode("utf-8", "ignore")
        # pull every pic_url field out of the embedded JSON
        pat = 'pic_url":"//(.*?)"'
        imagelist = re.compile(pat).findall(data)
        # download every image found on this page
        for j in range(0, len(imagelist)):
            thisimg = imagelist[j]
            thisimgurl = "http://" + thisimg
            # save under D:/pic/ on the local disk
            savefile = 'D:/pic/' + str(i) + str(j) + '.jpg'
            urllib.urlretrieve(thisimgurl, filename=savefile)

if __name__ == '__main__':
    crawl_taobao()
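urllib.urlretrieve will fail with an IOError if the D:/pic/ directory does not exist yet, so it is worth creating the output directory once before running crawl_taobao; a small sketch:

import os

savedir = 'D:/pic/'
# make sure the output directory exists before any images are downloaded
if not os.path.exists(savedir):
    os.makedirs(savedir)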
Part 3: a pyspider handler that crawls the top answers of a Zhihu topic and stores each question and its answers into a local MySQL database (the onlineq database used below). Create a new project in the pyspider web UI and paste this in as the handler script.

from pyspider.libs.base_handler import *
import MySQLdb
import random

class Handler(BaseHandler):
    crawl_config = {
        'headers': {
            'User-Agent': 'GoogleBot',
            'Host': 'www.zhihu.com',
        }
    }

    def __init__(self):
        # connect to the local MySQL database that will hold the crawled data
        self.db = MySQLdb.connect('localhost', 'root', '123456', 'onlineq', charset='utf8')

    # save a crawled Zhihu question into the database
    def add_question(self, title, content, comment_count):
        try:
            cursor = self.db.cursor()
            sql = 'insert into question(title, content, user_id, created_date, comment_count) values ("%s", "%s", %d, now(), %d)' % (title, content, random.randint(20, 26), comment_count)
            print sql
            cursor.execute(sql)
            qid = cursor.lastrowid
            print qid
            self.db.commit()
            return qid
        except Exception, e:
            self.db.rollback()
            return 0

    # save a crawled answer (stored as a comment on the question) into the database
    def add_comment(self, qid, comment):
        try:
            cursor = self.db.cursor()
            sql = 'insert into comment(content, entity_type, entity_id, user_id, created_date) values ("%s", 100, %d, %d, now())' % (comment, qid, random.randint(20, 26))
            print sql
            cursor.execute(sql)
            self.db.commit()
        except Exception, e:
            print e
            self.db.rollback()

    @every(minutes=24 * 60)
    def on_start(self):
        self.crawl('https://www.zhihu.com/topic/19552330/top-answers', callback=self.index_page, validate_cert=False)

    @config(age=10 * 24 * 60 * 60)
    def index_page(self, response):
        # follow every question link on the topic's top-answers page
        for each in response.doc('a[data-za-detail-view-element_name="Title"]').items():
            self.crawl(each.attr.href, callback=self.detail_page, validate_cert=False)

    @config(priority=2)
    def detail_page(self, response):
        title = response.doc('h1.QuestionHeader-title').text()
        content = response.doc('span.RichText.ztext').html()
        items = response.doc('span.RichText.ztext.CopyrightRichText-richText').items()
        if content is None:
            content = ''
        # escape double quotes so the hand-built SQL string stays valid
        content = content.replace('"', '\\"')
        qid = self.add_question(title, content, sum(1 for x in items))
        for each in response.doc('span.RichText.ztext.CopyrightRichText-richText').items():
            self.add_comment(qid, each.html().replace('"', '\\"'))
        return {
            "url": response.url,
            "title": response.doc('title').text(),
        }
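Note that add_question and add_comment build SQL by string formatting and escape double quotes by hand, which breaks on unusual content and is open to SQL injection. MySQLdb can do the quoting itself if the values are passed separately to cursor.execute; a minimal drop-in sketch for add_question, assuming the same table layout:

    def add_question(self, title, content, comment_count):
        try:
            cursor = self.db.cursor()
            # %s placeholders are filled in (and escaped) by the driver, not by string formatting
            sql = ('insert into question(title, content, user_id, created_date, comment_count) '
                   'values (%s, %s, %s, now(), %s)')
            cursor.execute(sql, (title, content, random.randint(20, 26), comment_count))
            qid = cursor.lastrowid
            self.db.commit()
            return qid
        except Exception, e:
            self.db.rollback()
            return 0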
Part 4: a pyspider handler that walks V2EX tabs, nodes, and topic pages and stores each topic into the same question table.

from pyspider.libs.base_handler import *
import random
import MySQLdb

class Handler(BaseHandler):
    crawl_config = {
    }

    def __init__(self):
        self.db = MySQLdb.connect('localhost', 'root', '123456', 'onlineq', charset='utf8')

    # save a crawled V2EX topic into the database
    def add_question(self, title, content):
        try:
            cursor = self.db.cursor()
            sql = 'insert into question(title, content, user_id, created_date, comment_count) values ("%s", "%s", %d, now(), 0)' % (title, content, random.randint(20, 22))
            print sql
            cursor.execute(sql)
            print cursor.lastrowid
            self.db.commit()
        except Exception, e:
            self.db.rollback()

    @every(minutes=24 * 60)
    def on_start(self):
        self.crawl('https://www.v2ex.com/', callback=self.index_page, validate_cert=False)

    @config(age=10 * 24 * 60 * 60)
    def index_page(self, response):
        # follow every tab linked from the front page
        for each in response.doc('a[href^="https://www.v2ex.com/?tab="]').items():
            self.crawl(each.attr.href, callback=self.tab_page, validate_cert=False)

    @config(priority=2)
    def tab_page(self, response):
        # follow every node (board) linked from the tab page
        for each in response.doc('a[href^="https://www.v2ex.com/go/"]').items():
            self.crawl(each.attr.href, callback=self.board_page, validate_cert=False)

    @config(priority=2)
    def board_page(self, response):
        # follow every topic on the board, stripping the #reply anchor so each topic is crawled once
        for each in response.doc('a[href^="https://www.v2ex.com/t/"]').items():
            url = each.attr.href
            if url.find('#reply') > 0:
                url = url[0:url.find('#')]
            self.crawl(url, callback=self.detail_page, validate_cert=False)
        # follow the board's pagination links
        for each in response.doc('a.page_normal').items():
            self.crawl(each.attr.href, callback=self.board_page, validate_cert=False)

    @config(priority=2)
    def detail_page(self, response):
        title = response.doc('h1').text()
        content = response.doc('div.topic_content').html().replace('"', '\\"')
        self.add_question(title, content)
        return {
            "url": response.url,
            "title": response.doc('title').text(),
        }
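Both pyspider handlers write into a question table (and the Zhihu one also into a comment table) in the onlineq database, but the schema itself is not shown above. The sketch below is only an assumption reconstructed from the INSERT statements; adjust column types and sizes to your needs:

import MySQLdb

# assumed schema, inferred from the INSERT statements above
QUESTION_TABLE = """
create table if not exists question (
    id int auto_increment primary key,
    title varchar(1024),
    content text,
    user_id int,
    created_date datetime,
    comment_count int
) default charset=utf8
"""

COMMENT_TABLE = """
create table if not exists comment (
    id int auto_increment primary key,
    content text,
    entity_type int,
    entity_id int,
    user_id int,
    created_date datetime
) default charset=utf8
"""

db = MySQLdb.connect('localhost', 'root', '123456', 'onlineq', charset='utf8')
cursor = db.cursor()
cursor.execute(QUESTION_TABLE)
cursor.execute(COMMENT_TABLE)
db.commit()
db.close()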