Terminal command to save the scraped data as JSON:
scrapy crawl <spider_name> -o <output_file>.json -s FEED_EXPORT_ENCODING=utf-8
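For the mokoko spider shown next, that command could look like this (the output file name is just an example):

scrapy crawl mokoko -o moko.json -s FEED_EXPORT_ENCODING=utf-8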
import scrapy
# items.py lives one package level above the spiders folder,
# so use .. to import it from the parent package
from ..items import MokoItem


class MokokoSpider(scrapy.Spider):
    name = 'mokoko'
    allowed_domains = ['moko.cc']
    # start_urls is usually the first thing to customize
    start_urls = ['http://www.moko.cc/channels/post/153/1.html']

    def parse(self, response):
        # print(response.text)
        ul_list = response.xpath('//ul[@class="post small-post"]')
        print(ul_list)
        for ul in ul_list:
            # initialize an item object
            item = MokoItem()
            # xpath() always returns a list-like SelectorList;
            # each element is a scrapy Selector, which can be
            # iterated over or queried again with xpath()
            title = ul.xpath('.//div[@class="cover"]/@cover-text')
            # print(title)
            # print(type(title))
            # extract() turns the selector into a list of strings; [0] takes the first match
            title = title.extract()[0]
            # print(title)
            # once extracted, the result is a plain list: it can be iterated,
            # but xpath() can no longer be called on it
            # print(type(title))
            clicknum = ul.xpath('.//li[last()]/span/text()').extract()[0]
            imgsrc = ul.xpath('.//img/@src2').extract()[0]
            item['title'] = title
            item['imgsrc'] = imgsrc
            item['clicknum'] = clicknum
            yield item
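For reference, a minimal items.py matching the fields this spider fills could look like the sketch below (an assumption based on the field names above, not the project's actual file):

# items.py -- assumed sketch declaring the fields used by MokokoSpider
import scrapy

class MokoItem(scrapy.Item):
    title = scrapy.Field()     # cover text of the post
    imgsrc = scrapy.Field()    # lazy-loaded image URL taken from @src2
    clicknum = scrapy.Field()  # view count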
# Project: parse the Douyu API's JSON response, extract the image links
# for downloading, and save the scraped data as JSON from the terminal
import scrapy
from ..items import DoudouyuItem
import json


class TupianSpider(scrapy.Spider):
    name = 'tupian'
    allowed_domains = ['api.douyucdn.cn']
    start_urls = ['http://api.douyucdn.cn/api/v1/getverticalRoom?limit=20&offset=']

    def parse(self, response):
        jsobj = json.loads(response.text)
        for src in jsobj["data"]:
            item = DoudouyuItem()
            src = src["room_src"]
            # print(src)
            item['src'] = [src]
            yield item
        # print('fetching the first page')
        for x in range(0, 500, 20):
            url = 'http://api.douyucdn.cn/api/v1/getverticalRoom?limit=20&offset=' + str(x)
            yield scrapy.Request(url=url, callback=self.parse)
To download the images only, uncomment ITEM_PIPELINES in settings.py and configure it as follows:
ITEM_PIPELINES = {
    # 'doudouyu.pipelines.DoudouyuPipeline': 300,
    # built-in pipeline dedicated to downloading images
    'scrapy.pipelines.images.ImagesPipeline': 1
}
IMAGES_STORE = 'tutuqian'
IMAGES_URLS_FIELD = 'src'
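Note that IMAGES_URLS_FIELD must name an item field holding a list of image URLs, which is why the spider wraps src in a list. A minimal items.py sketch matching that field (an assumption, not the project's actual file):

# items.py -- assumed sketch; ImagesPipeline reads its URL list from the field named by IMAGES_URLS_FIELD
import scrapy

class DoudouyuItem(scrapy.Item):
    src = scrapy.Field()  # a list containing one image URL per item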
Project: download the images from http://pic.netbian.com/4kmeishi/
# -*- coding: utf-8 -*-
import scrapy
from ..items import ImagenetItem


class ImageSpider(scrapy.Spider):
    name = 'image'
    allowed_domains = ['pic.netbian.com']
    start_urls = ['http://pic.netbian.com/4kmeishi/']

    def parse(self, response):
        img_list = response.xpath('//ul[@class="clearfix"]/li/a/img/@src')
        print(img_list)
        for img in img_list:
            item = ImagenetItem()
            src = 'http://pic.netbian.com/' + img.extract()
            # print(src)
            item['src'] = [src]
            yield item
        next_url = response.xpath('//div[@class="page"]/a[text()="下一页"]/@href').extract()
        if len(next_url) != 0:
            print('*****************************')
            url = 'http://pic.netbian.com/' + next_url[0]
            yield scrapy.Request(url=url, callback=self.parse)
            # print(next_url)
ITEM_PIPELINES = {
    # 'imagenet.pipelines.ImagenetPipeline': 300,
    # the key must be the full path of the built-in ImagesPipeline class
    'scrapy.pipelines.images.ImagesPipeline': 1
}
# directory where the downloaded images are stored
IMAGES_STORE = 'imagessss'
IMAGES_URLS_FIELD = 'src'
scrapy crawl <spider_name> -o <spider_name>.csv
scrapy crawl <spider_name> -o <spider_name>.xml
scrapy crawl <spider_name> -o <spider_name>.json -s FEED_EXPORT_ENCODING=utf-8
Project: save everything the thread starter posted in a Tieba thread to a TXT file
# -*- coding: utf-8 -*-
import scrapy
# from ..items import XiaoshuospiderItem


class ZhigengniaoSpider(scrapy.Spider):
    name = 'zhigengniao'
    allowed_domains = ['tieba.baidu.com']
    start_urls = ['https://tieba.baidu.com/p/5815118868?pn=']

    def parse(self, response):
        info_list = response.xpath('//div[@class="l_post l_post_bright j_l_post clearfix "]')
        for info in info_list:
            name_list = info.xpath('.//ul[@class="p_author"]/li/a/text()').extract()
            for name in name_list:
                if name == '乔深沉':
                    content_list = info.xpath('.//div[@class="p_content "]/cc/div/text()')
                    for con in content_list:
                        # item = XiaoshuospiderItem()
                        con = con.extract()
                        # item['con'] = con
                        # yield item
                        with open('xiaoshuo.txt', 'a', encoding='utf-8') as f:
                            f.write(con)
                            f.write('\n')
        next_url = response.xpath('//li[@class="l_pager pager_theme_5 pb_list_pager"]/a[text()="下一页"]/@href').extract()
        if len(next_url) != 0:
            url = 'https://tieba.baidu.com' + next_url[0]
            yield scrapy.Request(url=url, callback=self.parse)
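If you preferred the commented-out item route over writing xiaoshuo.txt directly, the matching items.py would only need the con field. A minimal sketch under that assumption:

# items.py -- assumed sketch for the commented-out XiaoshuospiderItem usage above
import scrapy

class XiaoshuospiderItem(scrapy.Item):
    con = scrapy.Field()  # one paragraph of the thread starter's post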
Project: save both the novel files and their cover images locally
# -*- coding: utf-8 -*-
import scrapy
from ..items import QishuItem


class XiaoshuoSpider(scrapy.Spider):
    name = 'xiaoshuo'
    allowed_domains = ['qisuu.la']
    start_urls = ['https://www.qisuu.la/']

    def parse(self, response):
        # print(response.text)
        # collect the links of all novel categories
        type_list = response.xpath('//div[@class="nav"]/a/@href').extract()
        # print(type_list)
        # the first entry is the home page, so drop it
        del type_list[0]
        # print the current url
        # print(response.url)
        for type in type_list:
            # build the url of each category page
            # inside this method response.url is the start_url
            url = response.url + type[1:]
            print(url)
            yield scrapy.Request(url=url, callback=self.get_content_with_type_url)

    # find the novels that belong to each category
    def get_content_with_type_url(self, response):
        # print(response.text)
        # collect the detail-page links of every novel on the first page of the category
        book_list = response.xpath('//div[@class="listBox"]/ul/li/a/@href').extract()
        print('************************')
        # print(book_list)
        for book in book_list:
            # inside this method response.url is https://www.qisuu.la/soft/sort0(x)/
            url = 'https://www.qisuu.la' + book
            yield scrapy.Request(url=url, callback=self.get_detail_with_book_url)

    # scrape the details of each book
    def get_detail_with_book_url(self, response):
        # extract_first: extract the list and return its first element (or the default)
        item = QishuItem()
        # novel title
        name = response.xpath('//div[@class="detail_right"]/h1/text()').extract_first('')
        info_list = response.xpath('//div[@class="detail_right"]/ul/li/text()').extract()
        # cover image url to download
        imageurl = response.xpath('//div[@class="detail_pic"]/img/@src').extract_first('')
        imageurl = 'https://www.qisuu.la' + imageurl
        # download url of the novel file
        downloadurl = response.xpath('//div[@class="showDown"]/ul/li[3]/script').extract_first('').split(',')[1].strip("'")
        print(downloadurl)
        print(imageurl)
        item['imageurl'] = [imageurl]
        item['downloadurl'] = [downloadurl]
        item['name'] = name
        clicknum = info_list[0]
        item['clicknum'] = clicknum
        filesize = info_list[1]
        item['filesize'] = filesize
        booktype = info_list[2]
        item['booktype'] = booktype
        updatetime = info_list[3]
        item['updatetime'] = updatetime
        bookstatus = info_list[4]
        item['bookstatus'] = bookstatus
        bookauthor = info_list[5]
        item['bookauthor'] = bookauthor
        print('//////////////////////////////////////////')
        print(info_list)
        yield item
ITEM_PIPELINES = {
    'qishu.pipelines.QishuPipeline': 300,
    # image download pipeline
    'scrapy.pipelines.images.ImagesPipeline': 1,
    # file (text) download pipeline
    'scrapy.pipelines.files.FilesPipeline': 2
}
IMAGES_STORE = 'file/image'
IMAGES_URLS_FIELD = 'imageurl'
FILES_STORE = 'file/book'
FILES_URLS_FIELD = 'downloadurl'
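A minimal items.py sketch for this project (assumed; the field names are taken from the spider above). The two URL fields hold lists because ImagesPipeline and FilesPipeline both expect a list of URLs:

# items.py -- assumed sketch, declaring the fields filled by get_detail_with_book_url
import scrapy

class QishuItem(scrapy.Item):
    name = scrapy.Field()
    imageurl = scrapy.Field()     # list: consumed by ImagesPipeline
    downloadurl = scrapy.Field()  # list: consumed by FilesPipeline
    clicknum = scrapy.Field()
    filesize = scrapy.Field()
    booktype = scrapy.Field()
    updatetime = scrapy.Field()
    bookstatus = scrapy.Field()
    bookauthor = scrapy.Field()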
Project: save the files accordingly, and make the JSON output well formed
# -*- coding: utf-8 -*-
import scrapy
import re
from ..items import HongxiuxiuItem


class XiuxiuSpider(scrapy.Spider):
    name = 'xiuxiu'
    allowed_domains = ['hongxiu.com']
    start_urls = ['https://www.hongxiu.com/all?gender=2&catId=-1']

    def parse(self, response):
        type_list = response.xpath('//ul[@type="category"]/li/a/@href').extract()
        del type_list[0]
        for type in type_list:
            url = 'https://www.hongxiu.com' + type
            split = re.compile(r'.*?catId=(.*?)&.*?', re.S)
            catId = re.findall(split, url)
            print(catId)
            yield scrapy.Request(url=url, meta={'type': catId[0]}, callback=self.get_content_with_type_url)

    def get_content_with_type_url(self, response):
        catId = response.meta['type']
        for page_num in range(1, 11):
            url = 'https://www.hongxiu.com/all?pageNum=' + str(page_num) + '&pageSize=10&gender=2&catId=' + catId + '&isFinish=-1&isVip=-1&size=-1&updT=-1&orderBy=0'
            print(url)
            yield scrapy.Request(url=url, callback=self.get_book_with_url)

    def get_book_with_url(self, response):
        detail_list = response.xpath('//div[@class="book-info"]/h3/a/@href').extract()
        for book in detail_list:
            url = 'https://www.hongxiu.com' + book
            print('********************************************************')
            print(url)
            yield scrapy.Request(url=url, callback=self.get_detail_with_url)

    def get_detail_with_url(self, response):
        type = response.xpath('//div[@class="crumbs-nav center1020"]/span/a[2]/text()').extract_first('')
        print(type)
        name = response.xpath('//div[@class="book-info"]/h1/em/text()').extract_first('')
        print(name)
        author = response.xpath('//div[@class="book-info"]/h1/a/text()').extract_first('')
        print(author)
        total = response.xpath('//p[@class="total"]/span/text()').extract_first('') + response.xpath('//p[@class="total"]/em/text()').extract_first('')
        print(total)
        love = response.xpath('//p[@class="total"]/span[2]/text()').extract_first('') + response.xpath('//p[@class="total"]/em[2]/text()').extract_first('')
        print(love)
        cilk = response.xpath('//p[@class="total"]/span[3]/text()').extract_first('') + response.xpath('//p[@class="total"]/em[3]/text()').extract_first('')
        print(cilk)
        introduce = response.xpath('//p[@class="intro"]/text()').extract_first('')
        print(introduce)
        url = 'https:' + response.xpath('//div[@class="book-img"]//img/@src').extract_first('')
        url = url.replace('\r', '')
        print(url)
        item = HongxiuxiuItem()
        item['type'] = type
        item['name'] = name
        item['author'] = author
        item['total'] = total
        item['love'] = love
        item['cilk'] = cilk
        item['introduce'] = introduce
        item['url'] = [url]
        yield item
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import codecs
import os
import json


class HongxiuxiuPipeline(object):
    def __init__(self):
        self.file = codecs.open(filename='book.json', mode='w+', encoding='utf-8')
        # open the JSON document; it is closed in close_spider
        self.file.write('{"book_list": [')

    def process_item(self, item, spider):
        res = dict(item)
        str = json.dumps(res, ensure_ascii=False)
        self.file.write(str)
        self.file.write(',\n')
        return item

    def close_spider(self, spider):
        # remove the trailing '\n' and ',' left after the last record,
        # then close the JSON array and object
        self.file.seek(-1, os.SEEK_END)
        self.file.truncate()
        self.file.seek(-1, os.SEEK_END)
        self.file.truncate()
        self.file.write(']}')
        self.file.close()
ITEM_PIPELINES = {
    'hongxiuxiu.pipelines.HongxiuxiuPipeline': 300,
    'scrapy.pipelines.images.ImagesPipeline': 1
}
IMAGES_STORE = 'TUTUTUTUTU'
IMAGES_URLS_FIELD = 'url'
Project: download the images, using each image section's title as the name of the folder the downloaded images are stored in
# -*- coding: utf-8 -*-
import scrapy
from ..items import SucaiItem


class TubiaoSpider(scrapy.Spider):
    name = 'tubiao'
    allowed_domains = ['sc.chinaz.com']
    start_urls = ['http://sc.chinaz.com/']

    def parse(self, response):
        icon_url = response.xpath('//li[@class="nos"]/a[3]/@href').extract_first('')
        full_url = 'http://sc.chinaz.com' + icon_url
        yield scrapy.Request(url=full_url, callback=self.parse_icon_url)

    def parse_icon_url(self, response):
        a_list = response.xpath('//ul[@class="pngblock imgload"]/li/span/a')
        for a in a_list:
            href = a.xpath('@href').extract_first('')
            title = a.xpath('text()').extract_first('')
            print(title)
            # meta carries data along to the next callback
            yield scrapy.Request(url=href, meta={'title': title}, callback=self.get_detail_with_url)

    def get_detail_with_url(self, response):
        title = response.meta['title']
        img_list = response.xpath('//div[@class="png_sl"]/div/img/@src').extract()
        for img in img_list:
            item = SucaiItem()
            item['title'] = title
            item['img'] = [img]
            yield item
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import scrapy
# the built-in pipeline that downloads images
from scrapy.pipelines.images import ImagesPipeline


# the built-in pipeline already knows how to download images;
# by inheriting from it our pipeline gets that ability too
class SucaiPipeline(ImagesPipeline):
    def get_media_requests(self, item, info):
        # print('pipeline method called')
        # this method is called once per item: every item yielded by the
        # spider goes to the engine, the engine hands it to the pipeline,
        # and the pipeline's built-in methods then run one after another
        yield scrapy.Request(url=item['img'][0], meta={'item': item})

    # the pipeline provides a series of built-in hooks that run in order
    def file_path(self, request, response=None, info=None):
        print('/8*/*/*/*965327-**/-*-/-*/*/*/***/*/**/**/*/')
        item = request.meta['item']
        print(item['title'])
        print(item['img'])
        # store each image under <section title>/<file name taken from the url>
        image_name = item['img'][0].split('/')[-1]
        path = '%s/%s' % (item['title'], image_name)
        return path
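ImagesPipeline also exposes an item_completed hook that runs after the downloads finish. If you wanted the stored path written back onto the item, a minimal sketch could look like this (image_path would be a hypothetical extra field, not part of the original SucaiItem):

# assumed sketch extending the idea above: record where each image ended up
from scrapy.pipelines.images import ImagesPipeline

class SucaiPathPipeline(ImagesPipeline):
    # get_media_requests / file_path would be the same as in SucaiPipeline above

    def item_completed(self, results, item, info):
        # results is a list of (success, info) tuples; info['path'] is relative to IMAGES_STORE
        stored = [img_info['path'] for ok, img_info in results if ok]
        if stored:
            item['image_path'] = stored[0]  # hypothetical extra field on SucaiItem
        return item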
DOWNLOAD_DELAY = 0.3  # uncomment this setting
ITEM_PIPELINES = {
    'sucai.pipelines.SucaiPipeline': 300,
}
IMAGES_STORE = 'imagesssssss'
Download the 4K landscape images
# -*- coding: utf-8 -*-
import scrapy
from ..items import TuwangItem


class BianSpider(scrapy.Spider):
    name = 'bian'
    allowed_domains = ['pic.netbian.com']
    start_urls = ['http://pic.netbian.com/4kfengjing/']

    def parse(self, response):
        img_list = response.xpath('//ul[@class="clearfix"]/li//img/@src').extract()
        for img in img_list:
            url = 'http://pic.netbian.com' + img
            print(url)
            item = TuwangItem()
            item['url'] = [url]
            yield item
        next_url = response.xpath('//div[@class="page"]/a[text()="下一页"]/@href').extract()
        if len(next_url) != 0:
            url = 'http://pic.netbian.com' + next_url[0]
            yield scrapy.Request(url=url, callback=self.parse)
ITEM_PIPELINES = {
    'tuwang.pipelines.TuwangPipeline': 300,
    'scrapy.pipelines.images.ImagesPipeline': 1
}
IMAGES_STORE = 'tutupian'
IMAGES_URLS_FIELD = 'url'
Project: scrape the novels and save them as JSON in the pipeline
# -*- coding: utf-8 -*-
import scrapy
from ..items import XiaoshuoItem


class XiaoxioashuoSpider(scrapy.Spider):
    name = 'xiaoxiaoshuo'
    allowed_domains = ['readnovel.com']
    start_urls = ['https://www.readnovel.com/']

    def parse(self, response):
        # book_list = response.xpath('//div[@class="book-info"]')
        book_list = response.css('.book-info')
        print(book_list)
        for book in book_list:
            # novel title
            name = book.xpath('.//h4/a/@title').extract_first('')
            if len(name) == 0:
                name = book.xpath('.//h3/a/@title').extract_first('')
            des = book.xpath('.//p/text()').extract_first('')
            author = book.xpath('.//div[@class="state-box cf"]/a/text()').extract_first('')
            type = book.xpath('.//div[@class="state-box cf"]/i/text()').extract_first('')
            item = XiaoshuoItem()
            item['name'] = name
            item['des'] = des
            item['author'] = author
            item['type'] = type
            yield item
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
# codecs opens the file with an explicit encoding, which avoids mojibake
import codecs
import json
import os


class XiaoshuoPipeline(object):
    def __init__(self):
        # w: write  r: read
        # w+: read/write, creates the file if it does not exist
        # r+: read/write, raises an error if the file does not exist
        self.file = codecs.open(filename='book.json', mode='w+', encoding='utf-8')
        self.file.write('{"list": [')

    # keep this method whenever data should be written to disk or to a database
    def process_item(self, item, spider):
        # convert the item object into a dict
        res = dict(item)
        # dumps turns the dict into a JSON string; ensure_ascii=False keeps
        # non-ASCII characters readable instead of escaping them.
        # A dict cannot be written to a file directly, so it has to be
        # serialized to a string first.
        str = json.dumps(res, ensure_ascii=False)
        # write the record to the file
        self.file.write(str)
        self.file.write(',\n')
        return item

    def open_spider(self, spider):
        pass

    def close_spider(self, spider):
        # delete the last two characters of the file (the trailing ',\n'):
        # seek(-1, os.SEEK_END) moves one position back from the end of the file,
        # truncate() cuts everything after that position
        self.file.seek(-1, os.SEEK_END)
        self.file.truncate()
        self.file.seek(-1, os.SEEK_END)
        self.file.truncate()
        self.file.write(']}')
        self.file.close()
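For reference, the finished book.json then has this overall shape (schematic only; field values elided):

{"list": [
{"name": "...", "des": "...", "author": "...", "type": "..."},
{"name": "...", "des": "...", "author": "...", "type": "..."}
]}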
# uncomment ITEM_PIPELINES in settings.py
ITEM_PIPELINES = {
    'xiaoshuo.pipelines.XiaoshuoPipeline': 300,
}
# -*- coding: utf-8 -*-
import scrapy
from ..items import HonghongxiuxiuItem


class HongxiuxiuSpider(scrapy.Spider):
    name = 'hongxiuxiu'
    allowed_domains = ['hongxiu.com']
    start_urls = ['https://www.hongxiu.com/finish?gender=2&catId=-1']

    def parse(self, response):
        li_list = response.xpath('//div[@class="right-book-list"]/ul/li')
        for li in li_list:
            # the src attribute is protocol-relative (//...), so prepend 'https:'
            img = 'https:' + li.xpath('.//div[@class="book-img"]/a/img/@src').extract_first('')
            print(img)
            name = li.xpath('.//div[@class="book-info"]/h3/a/text()').extract_first('')
            print(name)
            author = li.xpath('.//div[@class="book-info"]/h4/a/text()').extract_first('')
            print(author)
            intro = li.xpath('.//p[@class="intro"]/text()').extract_first('')
            print(intro)
            item = HonghongxiuxiuItem()
            item['img'] = img
            item['name'] = name
            item['author'] = author
            item['intro'] = intro
            yield item
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import sqlite3


class HonghongxiuxiuPipeline(object):
    def process_item(self, item, spider):
        return item


class HongXiuDBPipeline(object):
    def open_spider(self, spider):
        self.connect = sqlite3.connect('hongxiuDB')
        self.cursor = self.connect.cursor()
        self.cursor.execute('create table if not exists bookTable(name text,author text,img text, intro text)')
        self.connect.commit()

    def process_item(self, item, spider):
        self.cursor.execute('insert into bookTable (name,author,img,intro) VALUES ("{}","{}","{}","{}")'.format(item['name'], item['author'], item['img'], item['intro']))
        self.connect.commit()
        # return the item so the next pipeline (priority 300) still receives it
        return item

    def close_spider(self, spider):
        self.cursor.close()
        self.connect.close()
ITEM_PIPELINES = {
    'honghongxiuxiu.pipelines.HonghongxiuxiuPipeline': 300,
    'honghongxiuxiu.pipelines.HongXiuDBPipeline': 1
}
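A design note on HongXiuDBPipeline: formatting values straight into the SQL string breaks as soon as a field contains a double quote, and sqlite3 supports parameter binding for exactly this case. A sketch of the same insert inside process_item (an alternative, not the original code):

# assumed alternative for the insert in HongXiuDBPipeline.process_item, using sqlite3 placeholders
self.cursor.execute(
    'insert into bookTable (name, author, img, intro) values (?, ?, ?, ?)',
    (item['name'], item['author'], item['img'], item['intro'])
)
self.connect.commit()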
# -*- coding: utf-8 -*-
import scrapy
# example list page:
# https://search.51job.com/list/170200,000000,0000,00,9,99,java,2,1.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=
from ..items import JobItem


class JavaSpider(scrapy.Spider):
    name = 'zhiye'
    allowed_domains = ['search.51job.com']
    start_urls = ['https://search.51job.com/list/170200,000000,0000,00,9,99,java,2,1.html?',
                  'https://search.51job.com/list/170200,000000,0000,00,9,99,python,2,1.html?']

    def parse(self, response):
        # 1. extract every record on the current page and store it in an item
        # 2. find the link to the next page and request it
        div_list = response.xpath('//div[@id="resultList"]/div[@class="el"]')
        for div in div_list:
            # contains(): matches as long as the attribute contains the value
            jobname = div.xpath('.//p[contains(@class,"t1")]/span/a/@title').extract_first('')
            print(jobname)
            # company name
            companyname = div.xpath('.//span[@class="t2"]/a/@title').extract_first('')
            print(companyname)
            cityname = div.xpath('.//span[@class="t3"]/text()').extract_first('')
            print(cityname)
            salary = div.xpath('.//span[@class="t4"]/text()').extract_first('')
            print(salary)
            # normalize the salary text into a min/max monthly salary in units of 10k yuan
            min_salary = 0
            max_salary = 0
            if u'年' in salary:
                money = salary.split('万')[0].split('-')
                min_salary = float(money[0]) / 12
                min_salary = '%.1f' % min_salary
                max_salary = '%.1f' % (float(money[1]) / 12)
            elif u'万' in salary:
                money = salary.split('万')[0].split('-')
                min_salary = money[0]
                max_salary = money[1]
            elif u'千' in salary:
                money = salary.split('千')[0]
                if '-' in money:
                    min_salary = float(money.split('-')[0]) * 0.1
                    max_salary = float(money.split('-')[1]) * 0.1
                else:
                    min_salary = 0
                    max_salary = float(money) * 0.1
            elif u'日' in salary:
                money = salary.split('元')
                min_salary = 0
                max_salary = int(money[0]) * 30 / 10000
            else:
                min_salary = 0
                max_salary = 0
            date = div.xpath('.//span[@class="t5"]/text()').extract_first('')
            item = JobItem()
            item['jobname'] = jobname
            item['companyname'] = companyname
            item['cityname'] = cityname
            item['min_salary'] = min_salary
            item['max_salary'] = max_salary
            item['date'] = date
            yield item
        next_url = response.xpath('//li[@class="bk"]/a[text()="下一页"]/@href').extract()
        print('**********************************************')
        print(next_url)
        if len(next_url) != 0:
            print(next_url[0])
            yield scrapy.Request(url=next_url[0], callback=self.parse)
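To see what the salary normalization produces, here is one worked example (the input format is assumed from the branches above; the value is illustrative):

# illustrative walk-through of the '万/年' branch (assumed input format)
salary = '15-20万/年'
money = salary.split('万')[0].split('-')      # ['15', '20']
min_salary = '%.1f' % (float(money[0]) / 12)  # '1.2'  -> 10k yuan per month
max_salary = '%.1f' % (float(money[1]) / 12)  # '1.7'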
# -*- coding: utf-8 -*-
import scrapy
from ..items import DianyingItem


class TiantangSpider(scrapy.Spider):
    name = 'tiantang'
    allowed_domains = ['ygdy8.net']
    start_urls = ['http://www.ygdy8.net/html/gndy/index.html']

    def parse(self, response):
        detail_list = response.xpath('//div[@class="co_area2"]//tr')
        for detail in detail_list:
            url = 'http://www.ygdy8.net' + detail.xpath('.//td[1]/a[2]/@href').extract_first('')
            print(url)
            yield scrapy.Request(url=url, callback=self.detail_info)

    def detail_info(self, response):
        title = response.xpath('//div[@class="title_all"]//font/text()').extract_first('')
        href = response.xpath('//tr[@style="WORD-WRAP : break-word"]/a/@href').extract_first('')
        print(href)
        item = DianyingItem()
        item['title'] = title
        item['href'] = href
        yield item
import pymysql


class DianyingPipeline(object):
    def __init__(self):
        self.connect = pymysql.connect(host='localhost', user='root', password='666666', db='movie', port=3306)
        self.cursor = self.connect.cursor()

    def process_item(self, item, spider):
        self.cursor.execute('insert into movieTable(title,href) VALUES ("{}","{}")'.format(item['title'], item['href']))
        self.connect.commit()
        return item

    def close_spider(self, spider):
        self.cursor.close()
        self.connect.close()


ITEM_PIPELINES = {
    'dianying.pipelines.DianyingPipeline': 300,
}
# -*- coding: utf-8 -*-
import scrapy
from ..items import HongxiuItem


class HongxiuxiuSpider(scrapy.Spider):
    name = 'hongxiuxiu'
    allowed_domains = ['hongxiu.com']
    start_urls = ['https://www.hongxiu.com/all?catId=30008']

    def parse(self, response):
        li_list = response.xpath('//div[@class="right-book-list"]/ul/li')
        print(li_list)
        for li in li_list:
            img = 'https:' + li.xpath('.//div[@class="book-img"]/a/img/@src').extract_first('')
            title = li.xpath('.//div[@class="book-img"]/a/img/@alt').extract_first('')
            author = li.xpath('.//div[@class="book-info"]/h4/a/text()').extract_first('')
            intro = li.xpath('.//div[@class="book-info"]/p[@class="intro"]/text()').extract_first('')
            item = HongxiuItem()
            item["img"] = [img]
            item["title"] = title
            item["author"] = author
            item["intro"] = intro
            yield item
import scrapy
import codecs
import json
from scrapy.pipelines.images import ImagesPipeline


class HongxiuPipeline(object):
    def __init__(self):
        self.file = codecs.open(filename='hongxiu.json', mode='w+', encoding='utf-8')

    def process_item(self, item, spider):
        res = dict(item)
        str = json.dumps(res, ensure_ascii=False)
        self.file.write(str)
        self.file.write('\n')
        return item


class HongXiuDownloadPipeline(ImagesPipeline):
    def get_media_requests(self, item, info):
        url = item['img'][0]
        yield scrapy.Request(url=url, meta={'item': item})

    def file_path(self, request, response=None, info=None):
        item = request.meta['item']
        bookname = item['title']
        path = bookname + '.jpg'
        return path


ITEM_PIPELINES = {
    'hongxiu.pipelines.HongxiuPipeline': 300,
    'hongxiu.pipelines.HongXiuDownloadPipeline': 1
}
IMAGES_STORE = 'imgggg'
# -*- coding: utf-8 -*-
import scrapy
import re


class JobSpider(scrapy.Spider):
    name = 'job'
    allowed_domains = ['51job.com']
    start_urls = ['https://search.51job.com/list/170200,000000,0000,00,9,99,java,2,1.html',
                  'https://search.51job.com/list/170200,000000,0000,00,9,99,Python,2,1.html']

    def parse(self, response):
        total_page = response.xpath('//div[@class="p_in"]/span[1]/text()').extract_first('')
        print(total_page)
        # use a regular expression to pull the digits out of the page-count text
        res = re.compile(r'\d+')
        # findall returns a list of matches; take the first one
        result = re.findall(res, total_page)[0]
        print(result)
        # url of the page that was just requested
        url = response.url
        print(url)
        if 'java' in url:
            for page in range(1, int(result) + 1):
                java_url = 'https://search.51job.com/list/170200,000000,0000,00,9,99,java,2,{}.html'.format(page)
                yield scrapy.Request(url=java_url, callback=self.get_detail_with_page)
        else:
            for page in range(1, int(result) + 1):
                python_url = 'https://search.51job.com/list/170200,000000,0000,00,9,99,Python,2,{}.html'.format(page)
                yield scrapy.Request(url=python_url, callback=self.get_detail_with_page)

    def get_detail_with_page(self, response):
        print(response.url)
from scrapy.http.response.html import HtmlResponse
from scrapy import signals


class taobaospidermiddleware(object):
    def process_request(self, request, spider):
        if spider.name == 'shishang':
            spider.driver.get(request.url)
            spider.driver.implicitly_wait(10)
            response = HtmlResponse(url=spider.driver.current_url,
                                    request=request,
                                    body=spider.driver.page_source,
                                    encoding='utf-8')
            return response
# -*- coding: utf-8 -*-
import scrapy
from ..items import TaobaoItem
from selenium import webdriver


class ShishangSpider(scrapy.Spider):
    name = 'shishang'
    allowed_domains = ['taobao.com']
    start_urls = ['https://s.taobao.com/search?q=%E6%97%B6%E5%B0%9A%E7%94%B7%E9%9E%8B&imgfile=&commend=all&ssid=s5-e&search_type=item&sourceId=tb.index&spm=a21bo.2017.201856-taobao-item.1&ie=utf8&initiative_id=tbindexz_20170306&bcoffset=6&ntoffset=6&p4ppushleft=1%2C48&s=0']

    def __init__(self):
        self.driver = webdriver.PhantomJS()

    def parse(self, response):
        content_list = response.xpath('//div[@class="ctx-box J_MouseEneterLeave J_IconMoreNew"]')
        for x in content_list:
            name = x.xpath('.//div[@class="row row-2 title"]/a').xpath('string(.)').extract()[0].strip('\n').replace(' ', '').strip('\n')
            price = x.xpath('.//div[@class="price g_price g_price-highlight"]/strong/text()').extract_first('')
            dian_name = x.xpath('.//div[@class="shop"]/a/span[2]/text()').extract_first('')
            item = TaobaoItem()
            item['name'] = name
            item['price'] = price
            item['dian_name'] = dian_name
            yield item
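One loose end in this spider: the PhantomJS driver created in __init__ is never shut down. Scrapy spiders have a closed() hook that runs when the spider finishes, so a minimal sketch (an assumed addition to ShishangSpider, not part of the original) would be:

    # assumed addition inside ShishangSpider: quit the headless browser when the spider closes
    def closed(self, reason):
        self.driver.quit()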
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
from openpyxl import Workbook


class TaobaoPipeline(object):
    def __init__(self):
        self.wb = Workbook()
        self.ws = self.wb.active
        # header row: name, price, shop
        self.ws.append(['名称', '价格', '店铺'])

    def process_item(self, item, spider):
        line = [item['name'], item['price'], item['dian_name']]
        self.ws.append(line)
        # save after every item so partial results survive an interruption
        self.wb.save('时尚男鞋.xlsx')
        return item

    def close_spider(self, spider):
        pass
DOWNLOADER_MIDDLEWARES = {
    'taobao.middlewares.taobaospidermiddleware': 543,
}
ITEM_PIPELINES = {
    'taobao.pipelines.TaobaoPipeline': 300,
}