Python Web Scrapers

  • Scraping a web novel
    from bs4 import BeautifulSoup
    import urllib.request

    url = "http://www.jueshitangmen.info/tian-meng-bing-can-11.html"
    html = urllib.request.urlopen(url).read().decode('utf-8')
    soup = BeautifulSoup(html, features='lxml')

    # Scrape the Chinese text: grab every <p> tag and print its contents
    all_p = soup.find_all('p')
    for i in all_p:
        print('\n', i.get_text())
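
    Note that features='lxml' requires the third-party lxml parser; if it is missing, bs4 raises FeatureNotFound. Below is a minimal hedged variation of the same scraper that falls back to the stdlib parser and writes the chapter to a file instead of printing it (the chapter.txt filename is illustrative, not from the original):

    from bs4 import BeautifulSoup
    import urllib.request

    url = "http://www.jueshitangmen.info/tian-meng-bing-can-11.html"
    html = urllib.request.urlopen(url).read().decode('utf-8')
    try:
        soup = BeautifulSoup(html, features='lxml')         # fast third-party parser
    except Exception:
        soup = BeautifulSoup(html, features='html.parser')  # stdlib fallback
    # Write each paragraph of the chapter on its own line (illustrative filename)
    with open('chapter.txt', 'w', encoding='utf-8') as f:
        for p in soup.find_all('p'):
            f.write(p.get_text() + '\n')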

  • Scraping Baidu images
    import urllib.request
    import urllib.parse
    import os
    import re
    # Add headers: the Referer is required (without it Baidu returns a 403),
    # and the User-Agent is required so the request passes as a browser.
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
        'Referer': 'https://image.baidu.com'
    }
    url = "https://image.baidu.com/search/acjson?tn=resultjson_com&ipn=rj&ct=201326592&is=&fp=result&queryWord={word}&cl=2&lm=-1&ie=utf-8&oe=utf-8&adpicid=&st=-1&z=&ic=0&word={word}&s=&se=&tab=&width=&height=&face=0&istype=2&qc=&nc=1&fr=&cg=girl&pn={pageNum}&rn=30&gsm=1e00000000001e&1490169411926="
    keyword = input("Enter a keyword: ")
    keyword = urllib.parse.quote(keyword, 'utf-8')  # URL-encode the keyword
    n = 0  # page counter
    j = 0  # image file counter
    while n < 3000:
        n += 1
        # Build the request URL for the current page of results
        url1 = url.format(word=keyword, pageNum=str(n))
        rep = urllib.request.Request(url1, headers=header)
        rep = urllib.request.urlopen(rep)
        # Read the response; skip this page if it cannot be decoded
        try:
            html = rep.read().decode('utf-8')
        except Exception:
            print("Error reading data; current page:", str(n))
            continue
        # Use a regular expression to pull out the thumbnail URLs
        pattern = re.compile('thumbURL":"(.*?)"')
        data = re.findall(pattern, html)
        # Make sure the download directory exists
        if not os.path.isdir("D://pictures/图片"):
            os.makedirs(r"D://pictures/图片")
        # Download every image found on this page
        for i in data:
            print(i)
            urllib.request.urlretrieve(i, "D://pictures/图片/pic{num}.jpg".format(num=j))
            j += 1
            print("Total images downloaded: " + str(j))

  • Scraping the Maoyan Top 100 movie board
    import urllib.request
    import urllib.parse
    import re
    import os
    import json
    from multiprocessing import Pool

    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
        'Referer': 'https://image.baidu.com'
    }
    # Fetch one page of the board and return its decoded HTML
    def get_one_page(url):
        rep = urllib.request.Request(url, headers=header)
        rep = urllib.request.urlopen(rep)
        s = rep.read().decode("utf-8")
        return s
    # Use a regular expression to pick the needed fields out of the HTML
    def parse_one_page(html):
        pattern = re.compile(
            r'<dd>.*?board-index.*?>(\d+)</i>.*?data-src="(.*?)".*?name"><a'
            r'.*?>(.*?)</a>.*?star">(.*?)</p>.*?releasetime">(.*?)</p>'
            r'.*?integer">(.*?)</i>.*?fraction">(.*?)</i>.*?</dd>', re.S)
        items = re.findall(pattern, html)
        for item in items:
            yield {
                'index': item[0],
                'image': item[1],
                'title': item[2],
                'actor': item[3].strip()[3:],  # drop the leading "主演:" label
                'time': item[4].strip()[5:],   # drop the leading "上映时间:" label
                'score': item[5] + item[6]     # integer part + fraction part
            }

    # Append each movie to the output file as one JSON object per line
    def writedata(data):
        with open("maoyan.txt", 'a', encoding='utf-8') as f:
            f.write(json.dumps(data, ensure_ascii=False) + '\n')

    def main(num):
        url = 'http://maoyan.com/board/4?offset=' + str(num)
        html = get_one_page(url)
        for item in parse_one_page(html):
            print(item)
            writedata(item)

    if __name__ == '__main__':
        # Create a process pool and fetch the ten offset pages in parallel
        pool = Pool()
        pool.map(main, [i * 10 for i in range(10)])
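
    Since writedata() appends one JSON object per line, maoyan.txt can be read back with json.loads line by line. A minimal sketch (the load_movies name is my own, not from the original):

    import json

    def load_movies(path="maoyan.txt"):
        # Rebuild the list of movie dicts, one per line of the file
        movies = []
        with open(path, encoding='utf-8') as f:
            for line in f:
                movies.append(json.loads(line))
        return movies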


