#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2018-01-08 10:21:36
# Project: newv2ex
from pyspider.libs.base_handler import *
import pymysql
import random
class Handler(BaseHandler):
    """pyspider handler that crawls v2ex.com topics and stores them in MySQL.

    Crawl flow: home page -> tab pages -> node ("go") pages -> topic pages.
    Every topic page that is fetched is written to the ``question`` table of
    the local ``club`` database via :meth:`add_question`.
    """

    crawl_config = {}

    def add_question(self, title, content):
        """Insert one scraped topic into the ``question`` table.

        The row gets a random ``user_id`` in [1, 10] and a fixed
        ``comment_count`` of 2. The insert is best-effort: on any failure the
        transaction is rolled back, the error is printed and the method
        returns normally.

        Args:
            title: Topic title text.
            content: Topic body text.

        Returns:
            None.
        """
        db = pymysql.connect(host="localhost", user="root", password="root",
                             db="club", charset="utf8")
        # Placeholders are bound by the driver and must NOT be quoted by hand;
        # scraped text may contain quotes that would otherwise break (or
        # inject into) the statement.
        sql = ('INSERT INTO question '
               '(title,content,user_id,created_date,comment_count) '
               'VALUES (%s,%s,%s,now(),%s)')
        params = (title, content, random.randint(1, 10), 2)
        try:
            cursor = db.cursor()
            # mogrify() renders the statement exactly as execute() will send it.
            print(cursor.mogrify(sql, params))
            cursor.execute(sql, params)
            print(cursor.lastrowid)
            db.commit()
        except Exception as exc:
            db.rollback()
            print("add_question failed, rolled back: %r" % (exc,))
        finally:
            # One connection is opened per call, so always release it.
            if db.open:
                db.close()

    @every(minutes=24 * 60)
    def on_start(self):
        """Seed the crawl with the v2ex home page (scheduled once a day)."""
        self.crawl('https://www.v2ex.com/', callback=self.index_page,
                   validate_cert=False)

    @config(age=10 * 24 * 60 * 60)
    def index_page(self, response):
        """Queue every ``?tab=`` link found on the home page."""
        for each in response.doc('a[href^="https://www.v2ex.com/?tab="]').items():
            self.crawl(each.attr.href, callback=self.tab_page,
                       validate_cert=False)

    @config(priority=2)
    def tab_page(self, response):
        """Queue every node (``/go/``) link found on a tab page."""
        for each in response.doc('a[href^="https://www.v2ex.com/go/"]').items():
            self.crawl(each.attr.href, callback=self.board_page,
                       validate_cert=False)

    @config(priority=2)
    def board_page(self, response):
        """Queue the topic links and the pagination links of a node page."""
        for each in response.doc('a[href^="https://www.v2ex.com/t/"]').items():
            url = each.attr.href
            # Drop the "#reply..." fragment and crawl the cleaned URL
            # (presumably so one topic is not queued under several URLs).
            if url.find('#reply') > 0:
                url = url[:url.find('#')]
            self.crawl(url, callback=self.detail_page, validate_cert=False)
        for each in response.doc('a.page_normal').items():
            self.crawl(each.attr.href, callback=self.board_page,
                       validate_cert=False)

    @config(priority=2)
    def detail_page(self, response):
        """Extract a topic's title and body, store them and return them.

        Returns:
            dict with the keys ``url``, ``title`` and ``content``.
        """
        title = response.doc('h1').text()
        content = response.doc('div.topic_content').text()
        # Persist to MySQL in addition to returning the pyspider result.
        self.add_question(title, content)
        return {
            "url": response.url,
            "title": title,
            "content": content,
        }
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2018-01-08 20:58:58
# Project: newzhihu
from pyspider.libs.base_handler import *
import pymysql
import random
class Handler(BaseHandler):
    """pyspider handler that crawls Zhihu top answers and stores them in MySQL.

    Starting from the top-answers listing of topic 19550517, each linked
    question page is fetched; the question goes into the ``question`` table
    and every answer found on the page goes into the ``comment`` table of the
    local ``club`` database.
    """

    crawl_config = {
        'headers': {
            'User-Agent': 'GoogleBot',
        }
    }

    def _insert(self, sql, params):
        """Run one parameterized INSERT on a fresh connection.

        Args:
            sql: Statement using ``%s`` placeholders (unquoted).
            params: Sequence of values bound to the placeholders.

        Returns:
            ``cursor.lastrowid`` after a successful commit, or ``None`` when
            the statement failed and was rolled back.
        """
        db = pymysql.connect(host="localhost", user="root", password="root",
                             db="club", charset="utf8")
        try:
            cursor = db.cursor()
            # mogrify() renders the statement exactly as execute() will send it.
            print(cursor.mogrify(sql, params))
            cursor.execute(sql, params)
            row_id = cursor.lastrowid
            db.commit()
            return row_id
        except Exception as exc:
            db.rollback()
            print("insert failed, rolled back: %r" % (exc,))
            return None
        finally:
            # One connection is opened per call, so always release it.
            if db.open:
                db.close()

    def add_question(self, title, content, comment_count):
        """Insert one scraped question into the ``question`` table.

        The row gets a random ``user_id`` in [1, 10].

        Args:
            title: Question title text.
            content: Question detail text.
            comment_count: Number of answers stored with the question.

        Returns:
            The new row id, or 0 if the insert failed.
        """
        # Placeholders are bound by the driver and must NOT be quoted by hand;
        # scraped text may contain quotes that would otherwise break (or
        # inject into) the statement.
        sql = ('INSERT INTO question '
               '(title,content,user_id,created_date,comment_count) '
               'VALUES (%s,%s,%s,now(),%s)')
        qid = self._insert(
            sql, (title, content, random.randint(1, 10), comment_count))
        if qid is None:
            return 0
        print(qid)
        return qid

    def add_comment(self, qid, comment):
        """Insert one answer as a row of the ``comment`` table.

        ``entity_type`` is fixed to 1 (presumably "question" -- confirm
        against the schema) and ``user_id`` is random in [1, 10]. Failures are
        rolled back and printed, not raised.

        Args:
            qid: Row id of the question the comment belongs to.
            comment: Answer text.

        Returns:
            None.
        """
        sql = ('insert into comment'
               '(content, entity_type, entity_id, user_id, created_date) '
               'values (%s,%s,%s,%s,now())')
        self._insert(sql, (comment, 1, qid, random.randint(1, 10)))

    @every(minutes=24 * 60)
    def on_start(self):
        """Seed the crawl with the topic's top-answers page (once a day)."""
        self.crawl('https://www.zhihu.com/topic/19550517/top-answers',
                   callback=self.index_page, validate_cert=False)

    @config(age=10 * 24 * 60 * 60)
    def index_page(self, response):
        """Queue the question links and the pager links of a listing page."""
        for each in response.doc('a.question_link').items():
            self.crawl(each.attr.href, callback=self.detail_page,
                       validate_cert=False)
        for each in response.doc('div.zm-invite-pager span a').items():
            self.crawl(each.attr.href, callback=self.index_page,
                       validate_cert=False)

    @config(priority=2)
    def detail_page(self, response):
        """Store a question and its answers, then return the question fields.

        Returns:
            dict with the keys ``url``, ``title`` and ``content``.
        """
        title = response.doc('h1.QuestionHeader-title').text()
        content = response.doc('div.QuestionHeader-detail').text()
        # Collect the answers once; the list supplies both the count stored
        # on the question and the comment rows.
        answers = [
            each.text()
            for each in response.doc(
                'span.RichText.CopyrightRichText-richText').items()
        ]
        qid = self.add_question(title, content, len(answers))
        # add_question returns 0 on failure; skip the comments then so they
        # are not attached to entity_id 0.
        if qid:
            for answer in answers:
                self.add_comment(qid, answer)
        return {
            "url": response.url,
            "title": title,
            "content": content,
        }