Navigation
- 1: Project directory structure
- 2: jobbole.py in spiders/
- 3: common.py in utils/
- 4: items.py
- 5: main.py
- 6: pipelines.py
- 7: settings.py
1: Project directory structure
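The directory listing itself is missing here, so the tree below is reconstructed from the imports used throughout the post (spiderBaby.items, spiderBaby.utils.common, spiderBaby.spiders.jobbole); treat it as an approximation.

spiderBaby/
├── scrapy.cfg
├── main.py
└── spiderBaby/
    ├── items.py
    ├── pipelines.py
    ├── settings.py
    ├── utils/
    │   └── common.py
    └── spiders/
        └── jobbole.py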

2: jobbole.py in spiders/
import scrapy
from scrapy import Request
from urllib import parse
import requests  # unused by the spider; leftover from manual testing, like the proxies below
import re
import json
from spiderBaby.items import JobBoleArticleItem
from spiderBaby.utils import common

# Leftover from testing behind a corporate proxy. The spider never uses these;
# in Scrapy a proxy would be set per request via meta={'proxy': 'http://...'}.
proxy = 'proxy-cn.toshiba.co.jp:8080'
proxies = {
    'http': 'http://' + proxy,
    'https': 'https://' + proxy,
}
class JobboleSpider(scrapy.Spider):
    name = 'jobbole'
    allowed_domains = ['news.cnblogs.com']
    start_urls = ['http://news.cnblogs.com/']

    def parse(self, response):
        # [:1] limits the run to the first entry while debugging; drop it to crawl the full list
        post_nodes = response.css('#news_list .news_block')[:1]
        for post_node in post_nodes:
            image_url = post_node.css(".entry_summary a img::attr(src)").extract_first("")
            if image_url.startswith("//"):
                image_url = "https:" + image_url
            post_url = post_node.css("h2 a::attr(href)").extract_first("")
            """
            Path joining: urljoin(base, url) joins base and url into one complete
            path when url is relative; when url is already complete it is not
            joined, and url is used as the path directly.
            """
            yield Request(url=parse.urljoin(response.url, post_url),
                          meta={"front_image_url": image_url}, callback=self.parse_detail)
    def parse_detail(self, response):
        """
        Three things happen here:
        1: pull the fields we want from the detail page (I don't actually need them all)
        2: store the values in the item; the keys match the Fields declared in items.py
        3: issue one more request, for the Ajax-loaded stats page, passing the item
           along to the next parse function via meta.
        """
        match_re = re.match(r".*?(\d+)", response.url)
        if match_re:
            post_id = match_re.group(1)
            article_item = JobBoleArticleItem()
            news_title = response.css("#news_title a::text").extract_first("")
            create_time = response.css("#news_info .time::text").extract_first("")
            match_res = re.match(r".*?(\d+.*)", create_time)
            if match_res:
                create_time = match_res.group(1)
            news_content = response.css("#news_content #news_body").extract()
            tag_list = response.css(".news_tags a::text").extract()
            tag_str = ",".join(tag_list)
            article_item["news_title"] = news_title
            article_item["create_time"] = create_time
            article_item["news_content"] = news_content
            article_item["tag_str"] = tag_str
            if response.meta.get("front_image_url", ""):
                # ImagesPipeline expects a list of urls
                article_item["front_image_url"] = [response.meta.get("front_image_url", "")]
            else:
                article_item["front_image_url"] = []
            article_item["url"] = response.url
            yield Request(url="https://news.cnblogs.com/NewsAjax/GetAjaxNewsInfo?contentId={}".format(post_id),
                          dont_filter=True, meta={"article_item": article_item},
                          callback=self.parse_nums)
    def parse_nums(self, response):
        """
        1: retrieve the item object from meta
        2: store the Ajax-loaded stats in it, and hash the variable-length url
           into a fixed-length id with md5
        3: yield the finished item, which flows on to the pipelines
        """
        j_data = json.loads(response.text)
        article_item = response.meta.get("article_item", "")
        praise_nums = j_data["DiggCount"]
        fav_nums = j_data["TotalView"]  # TotalView is actually the view count
        comment_nums = j_data["CommentCount"]
        article_item["praise_nums"] = praise_nums
        article_item["fav_nums"] = fav_nums
        article_item["comment_nums"] = comment_nums
        article_item["url_object_id"] = common.get_md5(article_item["url"])
        yield article_item
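The urljoin behaviour described in the docstring inside parse is easy to verify in isolation. The post urls on this site appear to be relative paths (something like /n/123456/, judging by the digit-extracting regex in parse_detail), which is also why the original plain string concatenation worked:

from urllib import parse

base = "https://news.cnblogs.com/"
print(parse.urljoin(base, "/n/123456/"))             # https://news.cnblogs.com/n/123456/ (relative: joined)
print(parse.urljoin(base, "https://example.com/x"))  # https://example.com/x (already complete: used as-is)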
3: common.py in utils/
import hashlib

def get_md5(url):
    # hashlib works on bytes, so encode str input first
    if isinstance(url, str):
        url = url.encode("utf-8")
    m = hashlib.md5()
    m.update(url)
    return m.hexdigest()

if __name__ == "__main__":
    print(get_md5("https"))
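However long the input url is, hexdigest() always returns 32 hex characters, which is what makes url_object_id usable as a fixed-width database key:

print(len(get_md5("https")))                          # 32
print(len(get_md5("https://news.cnblogs.com/n/1/")))  # 32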
4: items.py
import scrapy

class SpiderbabyItem(scrapy.Item):
    pass

class JobBoleArticleItem(scrapy.Item):
    news_title = scrapy.Field()
    create_time = scrapy.Field()
    url = scrapy.Field()
    url_object_id = scrapy.Field()
    front_image_url = scrapy.Field()
    front_image_path = scrapy.Field()
    praise_nums = scrapy.Field()
    fav_nums = scrapy.Field()
    comment_nums = scrapy.Field()
    tag_str = scrapy.Field()
    news_content = scrapy.Field()
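An Item behaves like a dict with a fixed set of keys, which is why the spider can assign article_item["news_title"] = ... and a pipeline can call dict(item). A minimal sketch:

from spiderBaby.items import JobBoleArticleItem

item = JobBoleArticleItem()
item["news_title"] = "hello"
print(item["news_title"])   # hello
print(dict(item))           # {'news_title': 'hello'}
item["not_a_field"] = 1     # raises KeyError: only declared Fields are allowed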
5: main.py
from scrapy.cmdline import execute
import sys
import os

# Make the project root importable, then run the spider exactly as
# "scrapy crawl jobbole" would; handy for debugging from an IDE.
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
execute(["scrapy", "crawl", "jobbole"])
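main.py is assumed to sit at the project root next to scrapy.cfg (see the tree in section 1); running it is equivalent to typing scrapy crawl jobbole in a terminal, but lets you set breakpoints in an IDE.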
6: pipelines.py
from scrapy.pipelines.images import ImagesPipeline
import codecs
import json
import MySQLdb

class SpiderbabyPipeline:
    def process_item(self, item, spider):
        return item

class MysqlPipeline(object):
    # Synchronous writes: fine for small crawls, but each commit blocks the spider.
    def __init__(self):
        self.connect = MySQLdb.connect("127.0.0.1", 'root', 'rootJUST666', 'spider',
                                       charset='utf8', use_unicode=True)
        self.cursor = self.connect.cursor()

    def process_item(self, item, spider):
        insert_sql = """
            insert into spiders(content) values(%s)
        """
        self.cursor.execute(insert_sql, (item.get("news_title", ""),))
        self.connect.commit()
        return item

class JsonWithEncodingPipeline(object):
    def __init__(self):
        self.file = codecs.open("article.txt", "a", encoding="utf-8")

    def process_item(self, item, spider):
        # one JSON object per line; the trailing newline keeps records separated
        lines = json.dumps(dict(item), ensure_ascii=False) + "\n"
        self.file.write(lines)
        return item

    def close_spider(self, spider):
        # Scrapy calls close_spider() automatically; the original defined
        # spider_closed(), which is never invoked on a plain pipeline.
        self.file.close()

class ArticleImagePipeline(ImagesPipeline):
    def item_completed(self, results, item, info):
        if "front_image_url" in item:
            for ok, value in results:
                image_file_path = value["path"]
                item["front_image_path"] = image_file_path
        return item
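MysqlPipeline assumes a MySQL database named spider with a spiders table already in place. The post does not show the schema; the sketch below is a hypothetical minimum that satisfies the INSERT above (only the table and column names come from the pipeline's SQL, everything else is assumed):

import MySQLdb

# One-off setup script; credentials copied from MysqlPipeline.__init__.
conn = MySQLdb.connect("127.0.0.1", "root", "rootJUST666", "spider", charset="utf8")
cur = conn.cursor()
cur.execute("""
    CREATE TABLE IF NOT EXISTS spiders (
        id INT AUTO_INCREMENT PRIMARY KEY,
        content VARCHAR(255)
    ) CHARACTER SET utf8
""")
conn.commit()
conn.close()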
7: settings.py
import os

BOT_NAME = 'spiderBaby'
SPIDER_MODULES = ['spiderBaby.spiders']
NEWSPIDER_MODULE = 'spiderBaby.spiders'
ROBOTSTXT_OBEY = False

# Lower number = runs earlier, so images are downloaded (and front_image_path
# filled in) before the JSON and MySQL pipelines see the item.
ITEM_PIPELINES = {
    'spiderBaby.pipelines.ArticleImagePipeline': 1,
    'spiderBaby.pipelines.JsonWithEncodingPipeline': 2,
    'spiderBaby.pipelines.MysqlPipeline': 3,
    'spiderBaby.pipelines.SpiderbabyPipeline': 300,
}
# Tell ImagesPipeline which item field holds the list of image urls
IMAGES_URLS_FIELD = "front_image_url"
project_dir = os.path.dirname(os.path.abspath(__file__))
IMAGES_STORE = os.path.join(project_dir, 'images')
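One practical note: Scrapy's ImagesPipeline depends on Pillow; without pip install pillow the pipeline is disabled at startup with a warning, and front_image_path will never be filled in.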