'
,re.S)
items = re.findall(pattern, response.body.decode(response.encoding))
print("lin len: %d" % (len(items)))
for item in items:
    print("lin User: %s" % (item[0].strip()))
    print("lin Content: %s" % (item[1].strip()))
    print("lin God comments: %s" % (item[2].strip()))
    myItems = MyItem(user=item[0], content=item[1], godComment=item[2])
    yield myItems
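For reference, the MyItem class used above would be declared in /myproject/myproject/items.py. A minimal sketch, assuming the field names simply mirror the keyword arguments used in parse() (user, content, godComment):

# /myproject/myproject/items.py -- minimal sketch, field names assumed from the parse() code above
import scrapy

class MyItem(scrapy.Item):
    user = scrapy.Field()        # poster's user name
    content = scrapy.Field()     # post body text
    godComment = scrapy.Field()  # highest-voted ("god") comment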
5. Set the request headers in /myproject/myproject/settings.py
DEFAULT_REQUEST_HEADERS = {
    'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)',
}
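As an aside, Scrapy also exposes a dedicated USER_AGENT setting for this particular header; a sketch of the equivalent line (assuming no other middleware overrides it):

# /myproject/myproject/settings.py -- alternative: the dedicated user-agent setting
USER_AGENT = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'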
6. Run myspider with pageIndex=1 and save the scraped data to items.json
scrapy crawl myspider -a pageIndex=1 -o items.json
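The -a pageIndex=1 flag is delivered to the spider as a constructor keyword argument. A minimal sketch of how the spider might consume it (the class skeleton and the example.com URL template are assumptions, not the actual code from this project):

# /myproject/myproject/spiders/myspider.py -- sketch only; the real start URL is not shown in this note
import scrapy

class MySpider(scrapy.Spider):
    name = 'myspider'

    def __init__(self, pageIndex=1, *args, **kwargs):
        super(MySpider, self).__init__(*args, **kwargs)
        # -a pageIndex=1 arrives here as the string '1'
        self.start_urls = ['http://example.com/hot/page/%s' % pageIndex]

    def parse(self, response):
        # ... the regex-based extraction shown above goes here ...
        pass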
7. The console output and items.json hit the Unicode character-set export problem (non-ASCII text is escaped rather than readable)
8. Fixing the Unicode export problem in Scrapy
(http://blog.csdn.net/peihaozhu/article/details/53022236)
8.1 Configure /myproject/myproject/settings.py
from scrapy.exporters import JsonLinesItemExporter

class CustomJsonLinesItemExporter(JsonLinesItemExporter):
    def __init__(self, file, **kwargs):
        # Setting the superclass's ensure_ascii attribute to False is all that is needed
        super(CustomJsonLinesItemExporter, self).__init__(file, ensure_ascii=False, **kwargs)

# The new exporter class must also be enabled in the settings file
FEED_EXPORTERS = {
    'json': 'myproject.settings.CustomJsonLinesItemExporter',
}
8.2 Run the crawl again; the Unicode export problem in items.json is resolved. items.json is written under \myproject
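Note: on Scrapy 1.2 and later, the same fix is available without a custom exporter by setting the feed export encoding in settings.py; this is an alternative to step 8.1, not part of the original walkthrough:

# /myproject/myproject/settings.py -- alternative for Scrapy >= 1.2
FEED_EXPORT_ENCODING = 'utf-8'  # emit non-ASCII characters directly instead of \uXXXX escapes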