1.1 新建scrapy项目:
在D:\learning_code_scrapy (自定义)文件夹目录中执行cmd打开终端 终端执行:scrapy startproject Test_Monday_job51 使用pycharm打开文件:Test_Monday_job51(使用新窗口打开)
1.2 编写scrapy项目:
◆ 编辑 items.py 文件 import scrapy from scrapy import Item,Field class TestMondayItem(scrapy.Item): jobName = Field() # 工作名称 ComName = Field() # 公司名称 adress = Field() # 工作地点 money = Field() # 薪资 releaseTime = Field() # 发布时间 pass ◆ 在spiders 文件夹下 创建 Get_Data_job51.py 文件并编辑 from scrapy.selector import Selector from scrapy.spiders import CrawlSpider from Test_Monday_job51.items import TestMondayItem class Get_Data_job51(CrawlSpider): pageNum = 1 #初始化页面 name = "Get_Data_job51" #与文件名同名 start_urls = ['https://search.51job.com/list/020000,000000,0000,00,9,99,%25E4%25\ BA%25BA%25E5%25B7%25A5%25E6%2599%25BA%25E8%2583%25BD,2,1.html?lang=c&stype=\ 1&postchannel=0000&workyear=99&cotype=99°reefrom=99&jobterm=99&\companysize=\ 99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=4&dibiaoid\ =0&address=&line=&specialarea=00&from=&welfare='] def parse(self, response): Get_Data_job51.pageNum += 1 #获取下一页 selector = Selector(response) item = TestMondayItem() Infos = selector.xpath('//div[@id="resultList"][1]//div[@class="el"]') print(len(Infos)) for each in Infos: jobName = each.xpath('p/span/a/@title').extract() ComName = each.xpath('span[@class="t2"]/a/@title').extract() adress = each.xpath('span[2]/text()').extract() money = each.xpath('span[3]/text()').extract() releaseTime = each.xpath('span[4]/text()').extract() print(jobName,"\n",ComName,"\n",adress,"\n",money,"\n",releaseTime) item['jobName'] = jobName item['ComName'] = ComName item['adress'] = adress item['money'] = money item['releaseTime'] = releaseTime yield item #提交 item nextlink = selector.xpath('//div[@id="resultList"][1]//li[@class="bk"][2]/a/@href').extract()[0] if Get_Data_job51.pageNum<5 and nextlink: yield Request(nextlink,callback=self.parse) ◆ 在Test_Monday_job51文件夹下 创建 main.py 文件(与items文件同级)并编辑 from scrapy import cmdline cmdline.execute("scrapy crawl Get_Data_job51".split()) ◆ 编辑 pipline.py 文件(先将默认类注释) class json_TestMondayPipeline(object): #保存为json文件格式 def __init__(self): #打开或新建文件 self.file = open('json_51job.json','w',encoding='utf-8') def process_item(self,item,spider): #写入item 数据 line = json.dumps(dict(item),ensure_ascii=False)+"\n" #处理行数据 self.file.write(line) return item def close_spider(self,spider): self.file.close() ----------------------------------------------------------------------------------------------- class Excel_TestMondayPipeline(object): #保存为 Excel 文件格式 index = 0 def __init__(self): self.wk = xlwt.Workbook(encoding='utf-8') #打开或新建文件 self.sheet = self.wk.add_sheet('51job') field = ['职位名','公司名','工作地址','薪资','发布时间',] for i ,v in enumerate(field): self.sheet.write(0,i,v) def process_item(self,item,spider): #写入item 数据 Excel_TestMondayPipeline.index += 1 for j, v in enumerate(item.keys()): self.sheet.write(Excel_TestMondayPipeline.index, j, item[v]) return item def close_spider(self,spider): self.wk.save('51job.xls') # 保存文件 ◆ 设置 settings.py ( 找到 ITEM_PIPELINES 并编辑) ITEM_PIPELINES = { # 'Test_Monday_job51.pipelines.json_TestMondayPipeline': 300, #保存文 json 文件 'Test_Monday_job51.pipelines.Excel_TestMondayPipeline': 300, #保存为 excel 文件 } #释放 DOWNLOAD_DELAY DOWNLOAD_DELAY = 3 #延时 3 秒
Scrapy结合 CSS+xpath
win+r 打开cmd 执行:scrapy startproject Music
◆ 在 spiders 目录下创建Get_Music.py文件 from scrapy.spiders import CrawlSpider from scrapy.selector import Selector from Music.items import MusicItem import re class Music_menghua(CrawlSpider): name = "Get_Music" start_urls = ["http://xyq.163.com/download/down_music.html"] def parse(self, response): item = MusicItem() selector = Selector(response) Music_List = selector.css('#dLeft .g-clr section')[0].css('tbody tr') for tr in Music_List: music_name = tr.xpath('td[1]/text()')[0].extract() time_range = tr.xpath('td[2]/text()')[0].extract() music_link = tr.xpath('td[3]/a/@href').extract() print('3333333333333333333333333333', time_range) # 给item赋值 item['music_name'] = music_name # 歌名 item['time_range'] = time_range # 歌曲大小 item['music_link'] = music_link # 链接 yield item ◆ 编辑 items.py文件 import scrapy from scrapy import Field,Item class MusicItem(scrapy.Item): # define the fields for your item here like: # name = scrapy.Field() music_name = Field() time_range = Field() music_link = Field() pass ◆ 编辑 settings.py 文件 BOT_NAME = 'Music' SPIDER_MODULES = ['Music.spiders'] NEWSPIDER_MODULE = 'Music.spiders' USER_AGENT =['Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_3) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.54 Safari/536.5'] FEED_URI = u'file:///D:/learning_code_scrapy/Music.csv' #文件保存路径 FEED_FORMAT = 'CSV' ◆ 在spiders新建main.py(主函数),并编辑。--与items.py为同级文件 from scrapy import cmdline cmdline.execute("scrapy crawl Get_Music".split())