The First Crawler Application
The following code fetches the content of the 电子工业出版社 homepage at http://www.phei.com.cn.
# Import the requests module
import requests

# Define the get_content function
def get_content(url):
    resp = requests.get(url)
    return resp.text

# The "__name__ == '__main__'" guard keeps the code below from
# running automatically when this file is imported by another module
if __name__ == '__main__':
    # Define url: the address of the target site to crawl
    url = "http://www.phei.com.cn"
    # Call the function and assign the result to content
    content = get_content(url)
    # Print the first 50 characters of content
    print("The first 50 characters:", content[0:50])
    # Print the length of content
    content_len = len(content)
    print("Content length:", content_len)
    # Check whether the content length is at least 40 KB
    if content_len >= 40 * 1024:
        print("The content length is greater than or equal to 40KB")
    else:
        print("The content length is less than 40KB")
The output is as follows:
The first 50 characters:
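In practice, requests.get() can block for a long time on a slow server, or succeed while returning an error page. Below is a minimal hardening sketch of get_content; the 10-second timeout value and the raise_for_status() call are additions for illustration, not part of the original example.

import requests

def get_content(url):
    # Fail fast instead of waiting indefinitely on a slow server
    # (10 seconds is an assumed value, tune as needed)
    resp = requests.get(url, timeout=10)
    # Raise an exception on 4xx/5xx responses instead of
    # silently returning an error page's HTML
    resp.raise_for_status()
    return resp.text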
Dictionary Comprehension Expressions

A dict comprehension builds an entire mapping in one expression; here it maps each page number to its URL.

urls_d = {i: "www.xyz.com/?page={}".format(i) for i in range(1, 11)}
print(urls_d)

The output is as follows:
{1: 'www.xyz.com/?page=1', 2: 'www.xyz.com/?page=2', 3: 'www.xyz.com/?page=3', 4: 'www.xyz.com/?page=4', 5: 'www.xyz.com/?page=5', 6: 'www.xyz.com/?page=6', 7: 'www.xyz.com/?page=7', 8: 'www.xyz.com/?page=8', 9: 'www.xyz.com/?page=9', 10: 'www.xyz.com/?page=10'}
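For comparison, the comprehension above is equivalent to filling the dict with an explicit loop; this sketch produces exactly the same result.

# Build the same dict with an explicit loop instead of a comprehension
urls_d = {}
for i in range(1, 11):
    # Same key/value pairs as the comprehension produces
    urls_d[i] = "www.xyz.com/?page={}".format(i)
print(urls_d)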
The Second Crawler Application
This example combines dictionaries, lists, tuples, sets, loops, exception handling, and file operations in a single program.
import requests

urls_dict = {
    '电子工业出版社': 'http://www.phei.com.cn',
    '在线资源': 'http://www.phei.com.cn/module/zygl/zxzyindex.jsp',
    'xyz': 'www.phei.com.cn',
    '网上书店1': 'http://www.phei.com.cn/module/goods/wssd_index.jsp',
    '网上书店2': 'http://www.phei.com.cn/module/goods/wssd_index.jsp'
}
urls_lst = [
    ('电子工业出版社', 'http://www.phei.com.cn'),
    ('在线资源', 'http://www.phei.com.cn/module/zygl/zxzyindex.jsp'),
    ('xyz', 'www.phei.com.cn'),
    ('网上书店1', 'http://www.phei.com.cn/module/goods/wssd_index.jsp'),
    ('网上书店2', 'http://www.phei.com.cn/module/goods/wssd_index.jsp')
]

# Crawl using the dictionary
crawled_urls_for_dict = set()
for ind, name in enumerate(urls_dict.keys()):
    name_url = urls_dict[name]
    if name_url in crawled_urls_for_dict:
        print(ind, name, 'has already been crawled')
    else:
        try:
            resp = requests.get(name_url)
        except Exception as e:
            print(ind, name, ':', str(e)[0:50])
            continue
        content = resp.text
        crawled_urls_for_dict.add(name_url)
        # Save the page content to a local HTML file
        with open('bydict_' + name + '.html', 'w', encoding='utf-8') as f:
            f.write(content)
        print('Crawled: {} {}, content length {}'.format(ind, name, len(content)))

for u in crawled_urls_for_dict:
    print(u)
print('-' * 60)

# Crawl using the list
crawled_urls_for_list = set()
for ind, tup in enumerate(urls_lst):
    name = tup[0]
    name_url = tup[1]
    if name_url in crawled_urls_for_list:
        print(ind, name, 'has already been crawled')
    else:
        try:
            resp = requests.get(name_url)
        except Exception as e:
            print(ind, name, ':', str(e)[0:50])
            continue
        content = resp.text
        crawled_urls_for_list.add(name_url)
        # Save the page content to a local HTML file
        with open('bylist_' + name + '.html', 'w', encoding='utf-8') as f:
            f.write(content)
        print('Crawled: {} {}, content length {}'.format(ind, name, len(content)))

for u in crawled_urls_for_list:
    print(u)

The output of the dictionary-based pass is as follows:
Crawled: 0 在线资源, content length 49279
Crawled: 1 网上书店2, content length 130100
Crawled: 2 电子工业出版社, content length 102494
3 网上书店1 has already been crawled
4 xyz : Invalid URL 'www.phei.com.cn': No schema supplied.
http://www.phei.com.cn/module/zygl/zxzyindex.jsp
http://www.phei.com.cn/module/goods/wssd_index.jsp
http://www.phei.com.cn
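The dictionary pass and the list pass above repeat identical fetch-and-save logic. Below is a refactoring sketch that pulls the shared steps into one helper; the function name crawl_once, the scheme normalization (which would avoid the Invalid URL error shown above), and the timeout and utf-8 choices are assumptions for illustration, not part of the original example.

import requests

def crawl_once(ind, name, url, crawled, prefix):
    # Normalize entries like 'www.phei.com.cn' that lack a scheme,
    # which caused the Invalid URL error in the output above
    if not url.startswith(('http://', 'https://')):
        url = 'http://' + url
    # Skip URLs already fetched in this pass
    if url in crawled:
        print(ind, name, 'has already been crawled')
        return
    try:
        resp = requests.get(url, timeout=10)
    except Exception as e:
        print(ind, name, ':', str(e)[0:50])
        return
    crawled.add(url)
    # Save the page content to a local HTML file
    with open(prefix + name + '.html', 'w', encoding='utf-8') as f:
        f.write(resp.text)
    print('Crawled: {} {}, content length {}'.format(ind, name, len(resp.text)))

# Usage with the urls_dict and urls_lst defined above
crawled = set()
for ind, name in enumerate(urls_dict):
    crawl_once(ind, name, urls_dict[name], crawled, 'bydict_')
print('-' * 60)
crawled = set()
for ind, (name, url) in enumerate(urls_lst):
    crawl_once(ind, name, url, crawled, 'bylist_')

With the duplicated block reduced to one function, a change such as adding a retry or a different timeout only has to be made in one place.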