Typing scrapy startproject money163 in the terminal automatically generates a subdirectory of the same name together with a scrapy.cfg configuration file. The two __init__.py files are empty and can be ignored for now; the effort goes into items.py, settings.py, pipelines.py, and the spider program to be created under the spiders subdirectory.
Once this basic structure is in place, follow the steps below to define, in turn, the content to extract, the crawler's targets and behavior, and the data handling. Each definition corresponds to one file.
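The spider below fills in fields named news_thread, news_title, news_time, news_from, from_url, news_url, and news_body, so items.py must declare a matching Item class. The original items.py is not reproduced here; a minimal sketch, assuming exactly those field names, would be:

# items.py -- minimal sketch matching the fields used by the spider
import scrapy

class Stock163Item(scrapy.Item):
    news_thread = scrapy.Field()  # thread id taken from the end of the news URL
    news_title = scrapy.Field()   # page title
    news_time = scrapy.Field()    # publication time
    news_from = scrapy.Field()    # source media outlet
    from_url = scrapy.Field()     # link to the original source
    news_url = scrapy.Field()     # URL of the crawled page
    news_body = scrapy.Field()    # list of paragraphs in the article body

The spider itself, placed under the spiders subdirectory, is defined as follows.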
# encoding: utf-8
import scrapy
import re
from scrapy.selector import Selector
from stock163.items import Stock163Item
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
class ExampleSpider(CrawlSpider):
    name = "stocknews"  # the spider's name is "stocknews"
    allowed_domains = ["money.163.com"]  # domains the spider is allowed to crawl

    def __init__(self, id='600000', page='0', *args, **kwargs):
        # Initialization: id is the stock code (default '600000'), page is the
        # starting page of the news list (default '0'); any extra arguments are
        # passed on to CrawlSpider.
        # allowrule = r"/%s/%s\d+/\d+/*" % (year, month)
        # allowrule = r"/%s/%s%s/\d+/*" % (year, month, day)  # matches a date path such as "/2022/11/25/"
        allowrule = r"/\d+/\d+/\d+/*"  # regular expression for news links: digits/digits/digits/anything
        self.counter = 0  # counter used to track how many news items have been crawled
        self.stock_id = id  # keep the stock id for later use (e.g. in the pipeline)
        self.start_urls = ['http://quotes.money.163.com/f10/gsxw_%s,%s.html' % (id, page)]  # build the start URL from id and page
        ExampleSpider.rules = (Rule(LinkExtractor(allow=allowrule), callback="parse_news", follow=False),)
        # Crawling rules: LinkExtractor picks out links matching allowrule, each
        # matched page is handled by parse_news, and follow=False means links
        # found on those pages are not followed any further.
        # recompile the rule
        super(ExampleSpider, self).__init__(*args, **kwargs)
        # Call the parent (CrawlSpider) initializer so the rules above take effect.
        '''
        rules = (Rule(LinkExtractor(allow=r"/\d+/\d+/\d+/*"),
                      callback="parse_news", follow=True),)
        '''
    # f = open("out.txt", "w")
    def printcn(self, suni):
        # Helper (unused): print the characters of a unicode string one by one.
        for i in suni:
            print(i)
    def parse_news(self, response):
        item = Stock163Item()
        item['news_thread'] = response.url.strip().split('/')[-1][:-5]
        # Take the last path segment of the URL and drop the trailing ".html"
        # (5 characters) to obtain the news thread id; e.g. a URL ending in
        # ".../ABC123.html" yields "ABC123".
        # self.get_thread(response, item)  # no such helper is defined; news_thread is already set above
        self.get_title(response, item)
        self.get_source(response, item)
        self.get_url(response, item)
        self.get_news_from(response, item)
        self.get_from_url(response, item)
        self.get_text(response, item)
        return item  # remember to return the item after parsing
    def get_title(self, response, item):
        title = response.xpath("/html/head/title/text()").extract()
        if title:
            # print('title:' + title[0][:-5].encode('utf-8'))
            item['news_title'] = title[0][:-5]

    def get_source(self, response, item):
        source = response.xpath("//div[@class='left']/text()").extract()
        if source:
            # print('source' + source[0][:-5].encode('utf-8'))
            item['news_time'] = source[0][:-5]

    def get_news_from(self, response, item):
        news_from = response.xpath("//div[@class='left']/a/text()").extract()
        if news_from:
            # print('from' + news_from[0].encode('utf-8'))
            item['news_from'] = news_from[0]

    def get_from_url(self, response, item):
        from_url = response.xpath("//div[@class='left']/a/@href").extract()
        if from_url:
            # print('url' + from_url[0].encode('utf-8'))
            item['from_url'] = from_url[0]

    def get_text(self, response, item):
        news_body = response.xpath("//div[@id='endText']/p/text()").extract()
        if news_body:
            # for entry in news_body:
            #     print(entry.encode('utf-8'))
            item['news_body'] = news_body

    def get_url(self, response, item):
        news_url = response.url
        if news_url:
            print(news_url)
            item['news_url'] = news_url
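With the item and spider in place, the spider can be run from the project directory; the -a flag passes the arguments accepted by __init__ (the values below are simply the defaults):

scrapy crawl stocknews -a id=600000 -a page=0

Storage of the extracted items is handled by pipelines.py, shown next.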
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import os
def ParseFilePath(url, id):
    # The user should change this output folder to a suitable local path.
    outfolder = "e:\\data\\FinTech\\News\\Stocks\\%s" % id
    # Splitting the news URL on "/" puts the year in components[3], the
    # month+day in components[4], and the index and article id after that,
    # as the variable names below indicate.
    components = url.split("/")
    year = components[3]
    monthday = components[4]
    month = monthday[:2]
    day = monthday[2:]
    idx = components[5]
    page = idx + "_" + components[6]
    # folder = outfolder + "\\%s_%s_%s_" % (year, month, day)
    folder = outfolder
    if (year == '') or ('keywords' in page):
        filepath = 'xxx'  # sentinel: not a regular news URL, the pipeline skips it
    else:
        filepath = folder + "\\%s_%s_%s_%s.txt" % (year, month, day, page)
    filepath = filepath.replace('?', '_')
    return (folder, filepath)
class Stock163Pipeline(object):
    def process_item(self, item, spider):
        if spider.name != "stocknews":
            return item
        if item.get("news_thread", None) is None:
            return item
        url = item['news_url']
        if 'keywords' in url:
            return item
        folder, filepath = ParseFilePath(url, spider.stock_id)
        spider.counter = spider.counter + 1
        counterfilepath = folder + "\\counter.txt"
        # On a single machine there is virtually no risk of a race condition.
        if not os.path.exists(folder):
            os.makedirs(folder)
        # print(filepath, counterfilepath)
        # print(spider.stats)
        fo = open(counterfilepath, "w", encoding="UTF-8")
        fo.write(str(spider.counter))
        fo.close()
        if filepath != 'xxx':
            fo = open(filepath, 'w', encoding='utf-8')
            fo.write(str(dict(item)))
            fo.close()
        return item  # return the item so any later pipelines can still process it
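For the pipeline to be invoked, it has to be registered in settings.py, as the boilerplate comment above reminds us. Assuming the project package is named stock163 (matching the earlier import of Stock163Item), the entry would look like this, with 300 as an arbitrary priority value:

# settings.py (excerpt)
ITEM_PIPELINES = {
    'stock163.pipelines.Stock163Pipeline': 300,
}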
The spider can also be parameterized by site instead of by stock id, so that whole 163.com channels can be crawled; the date-based link rules are kept here as commented-out alternatives:

class ExampleSpider(CrawlSpider):
    name = "stocknews"

    def __init__(self, site='money.163.com', id='600000', *args, **kwargs):
        # allowrule = r"/%s/%s\d+/\d+/*" % (year, month)      # restrict links to a given month
        # allowrule = r"/%s/%s%s/\d+/*" % (year, month, day)   # restrict links to a given day
        allowrule = r"/\d+/\d+/\d+/*"
        self.counter = 0
        self.stock_id = id
        self.start_urls = ['http://%s' % site]
        ExampleSpider.rules = (Rule(LinkExtractor(allow=allowrule), callback="parse_news", follow=False),)
        # recompile the rule
        super(ExampleSpider, self).__init__(*args, **kwargs)
These spiders can also be launched programmatically with CrawlerProcess rather than from the scrapy command line:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())

# Crawl the news of a single stock (example values for the arguments).
stockid = '600000'
page = 0
process.crawl('stocknews', id=stockid, page=str(page))

# Crawl several 163.com channels with the site-parameterized spider;
# 'myspider' must match that spider's name attribute.
for site in ['money.163.com', 'tech.163.com', 'money.163.com/stock']:
    process.crawl('myspider', site=site)

process.start()  # blocks until all scheduled crawls finish
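Once the crawls have finished, the files written by Stock163Pipeline can be loaded back for analysis: each file stores str(dict(item)), which is a valid Python literal. A minimal sketch, assuming the output folder used in ParseFilePath and the example stock id 600000:

import ast
import glob

# Hypothetical path: the folder Stock163Pipeline writes for stock 600000.
for name in glob.glob(r"e:\data\FinTech\News\Stocks\600000\*.txt"):
    if name.endswith("counter.txt"):
        continue  # skip the crawl counter file
    with open(name, encoding="utf-8") as f:
        item = ast.literal_eval(f.read())  # parse the str(dict(item)) literal back into a dict
    print(item.get("news_title"), item.get("news_time"))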
The next example moves from text to images: Keras loads a VGG16 network pre-trained on ImageNet, a new fully connected classifier is stacked on top, and the combined model is fine-tuned on MNIST digits resized to VGG16's input size.
from keras.applications.vgg16 import VGG16
from keras.layers import Input,Flatten,Dense,Dropout
from keras.models import Model
from keras.optimizers import SGD
from keras.datasets import mnist
import cv2
import h5py as h5py
import numpy as np
ishape = 224  # assumed input size: VGG16's standard 224x224 input, matching the (224, 224, 3) shape used below
model_vgg = VGG16(include_top=False, weights='imagenet', input_shape=(ishape, ishape, 3))
model = Flatten(name='flatten')(model_vgg.output)
model = Dense(4096,activation='relu',name='fc1')(model)
model = Dense(4096,activation='relu',name='fc2')(model)
model = Dropout(0.5)(model)
model = Dense(10,activation='softmax')(model)
model_vgg_mnist = Model(model_vgg.input,model,name='vgg16')
model_vgg_mnist.summary()
model_vgg = VGG16(include_top=False,weights='imagenet',input_shape=(224,224,3))
for layer in model_vgg.layers:
    layer.trainable = False  # freeze the pre-trained convolutional layers so only the new classifier is trained
model = Flatten()(model_vgg.output)
model = Dense(4096,activation='relu',name='fc1')(model)
model = Dense(4096,activation='relu',name='fc2')(model)
model = Dropout(0.5)(model)
model = Dense(10,activation='softmax',name='prediction')(model)
model_vgg_mnist_pretrain = Model(model_vgg.input,model,name='vgg16_pretrain')
model_vgg_mnist_pretrain.summary()
sgd = SGD(lr = 0.05,decay=1e-5)
model_vgg_mnist_pretrain.compile(loss='categorical_crossentropy',optimizer=sgd,metrics=['accuracy'])
(x_train,y_train),(x_test,y_test) = mnist.load_data()
x_train = [cv2.cvtColor(cv2.resize(i,(ishape,ishape)),cv2.COLOR_GRAY2BGR) for i in x_train]
x_train = np.concatenate([arr[np.newaxis] for arr in x_train]).astype('float32')
x_test = [cv2.cvtColor(cv2.resize(i,(ishape,ishape)),cv2.COLOR_GRAY2BGR) for i in x_test]
x_test = np.concatenate([arr[np.newaxis] for arr in x_test]).astype('float32')
x_test.shape
x_train.shape
x_train /= 255
x_test /= 255
np.where(x_train[0]!=0)
def tran_y(y):
    # One-hot encode a label: a length-10 vector with a 1 at position y.
    y_ohe = np.zeros(10)
    y_ohe[y] = 1
    return y_ohe
y_train_ohe = np.array([tran_y(y_train[i]) for i in range(len(y_train))])
y_test_ohe = np.array([tran_y(y_test[i]) for i in range(len(y_test))])
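tran_y hand-rolls the one-hot encoding; Keras's own utility gives the same result, so an equivalent alternative (standard Keras 2 API) is:

from keras.utils import to_categorical

y_train_ohe = to_categorical(y_train, num_classes=10)  # same one-hot encoding as tran_y
y_test_ohe = to_categorical(y_test, num_classes=10)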
model_vgg_mnist_pretrain.fit(x_train,y_train_ohe,validation_data=(x_test,y_test_ohe),epochs=200,batch_size=128)
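After training, the fine-tuned model can be scored on the held-out test set; evaluate returns the loss together with the accuracy metric passed to compile:

loss, accuracy = model_vgg_mnist_pretrain.evaluate(x_test, y_test_ohe, batch_size=128)
print('test loss: %.4f, test accuracy: %.4f' % (loss, accuracy))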