七月算法《python爬虫》第一课:Python爬虫小示例

这是七月算法课程《Python爬虫》第一课里的示例代码,内容很简单。
下载地址:http://download.csdn.net/detail/nnnnnnnnnnnny/9715077

jupyter notebook文件贴在这里了。

七月算法 Python爬虫项目班

课后习题一

爬取邮政编码查询网页 http://www.ip138.com/post/ ,提取每个省份邮政编码的开头数字。

import requests  #比urllib好用
import xml.etree.ElementTree as ET
from xml.parsers.expat import ParserCreate

class DefaultSaxHandler(object):
    """Expat callback holder that collects ``(title, href)`` pairs.

    Every start tag other than ``map`` that carries both a ``title`` and an
    ``href`` attribute is recorded in the list supplied by the caller.
    """

    def __init__(self, provinces):
        """Remember the list that receives the collected pairs.

        Args:
            provinces: Mutable list; ``(title, href)`` tuples are appended
                to it in place as matching start tags are seen.
        """
        self.provinces = provinces

    def start_element(self, name, attrs):
        """Handle a start tag reported by the parser.

        Args:
            name: Tag name.
            attrs: Mapping of attribute names to values for this tag.
        """
        # The enclosing <map> tag carries no entry of its own.
        if name == 'map':
            return
        title = attrs.get('title')
        href = attrs.get('href')
        # Skip elements missing either attribute instead of aborting the
        # whole parse with a KeyError.
        if title is None or href is None:
            return
        self.provinces.append((title, href))

    def end_element(self, name):
        """Handle an end tag; nothing needs to be done."""
        pass

    def char_data(self, text):
        """Handle character data; text content is ignored."""
        pass

def get_province_entry(url):
    """Download *url* and list the entries found inside its ``<map>`` block.

    The page is fetched with ``requests``, decoded as gb2312, cut down to
    the ``<map>...</map>`` fragment and fed to an expat parser whose
    callbacks are provided by ``DefaultSaxHandler``.

    Args:
        url: Address of the page to download.

    Returns:
        List of ``(title, href)`` tuples collected by the handler.

    Raises:
        ValueError: If the page contains no ``<map>...</map>`` fragment.
        Errors from ``requests``, from decoding, or from expat propagate
        unchanged.
    """
    # Fetch the raw bytes and decode them as gb2312.
    content = requests.get(url).content.decode('gb2312')

    # NOTE(review): the marker literals were empty strings in this copy of
    # the code, which made the slice below empty and the parse fail. They
    # are reconstructed here from the handler's check for the 'map' tag;
    # confirm against the actual page markup.
    open_marker = '<map'
    close_marker = '</map>'
    start = content.find(open_marker)
    # Look for the closing tag only after the opening one.
    end = content.find(close_marker, start) if start != -1 else -1
    if start == -1 or end == -1:
        raise ValueError('no <map>...</map> block found in ' + url)

    # Keep just the fragment, including its closing tag, without
    # surrounding whitespace.
    content = content[start:end + len(close_marker)].strip()

    provinces = []
    # SAX-style handler that appends its findings to `provinces`.
    handler = DefaultSaxHandler(provinces)
    # Wire the handler's callbacks into a fresh expat parser.
    parser = ParserCreate()
    parser.StartElementHandler = handler.start_element
    parser.EndElementHandler = handler.end_element
    parser.CharacterDataHandler = handler.char_data
    # The whole fragment is passed in one call, so mark it as final.
    parser.Parse(content, True)
    return provinces

# Scrape the postcode index page and show the (title, href) pairs found.
provinces = get_province_entry('http://www.ip138.com/post')
print("provinces:", provinces)
provinces: [('新疆', '/83/'), ('西藏', '/85/'), ('青海', '/81/'), ('甘肃', '/73/'), ('四川', '/61/'), ('云南', '/65/'), ('宁夏', '/75/'), ('内蒙古', '/01/'), ('黑龙江', '/15/'), ('吉林', '/13/'), ('辽宁', '/11/'), ('河北', '/50/'), ('北京', '/10/'), ('天津', '/30/'), ('陕西', '/71/'), ('山西', '/03/'), ('山东', '/25/'), ('河南', '/45/'), ('重庆', '/40/'), ('湖北', '/43/'), ('安徽', '/23/'), ('江苏', '/21/'), ('上海', '/20/'), ('贵州', '/55/'), ('广西', '/53/'), ('湖南', '/41/'), ('江西', '/33/'), ('浙江', '/31/'), ('福建', '/35/'), ('广东', '/51/'), ('海南', '/57/'), ('台湾', '/taiwang/'), ('澳门', '/aomen/'), ('香港', '/xianggang/')]

课后习题二

爬取 http://hq.sinajs.cn/list= 接口,查看几家公司的股价信息。

import requests
import threading

def display_info(code, timeout=10):
    """Fetch the quote text for one stock code and print it.

    Args:
        code: Code string appended to the query URL, e.g. ``'sh600004'``.
        timeout: Seconds to wait for the server before ``requests`` gives
            up. Without a timeout a stalled connection would block the
            calling thread indefinitely.
    """
    url = 'http://hq.sinajs.cn/list=' + code
    # The response body is printed as-is; no parsing is done here.
    response = requests.get(url, timeout=timeout).text
    print(response)

def single_thread(codes):
    """Print the quote for every code in *codes*, one after another.

    Args:
        codes: Iterable of code strings; surrounding whitespace is
            stripped from each before it is looked up.
    """
    for raw_code in codes:
        display_info(raw_code.strip())

def multi_thread(tasks):
    """Run ``single_thread`` on each group of codes in its own thread.

    Args:
        tasks: Iterable of code lists; one thread is created per list.

    Returns once every thread has finished.
    """
    workers = []
    for codes in tasks:
        # `args` must be a tuple, so the single argument needs the
        # trailing comma.
        worker = threading.Thread(target=single_thread, args=(codes,))
        workers.append(worker)

    # Start all threads first ...
    for worker in workers:
        worker.start()
    # ... then wait for each of them to finish.
    for worker in workers:
        worker.join()

# Script entry point: runs only when the file is executed directly.
if __name__ == '__main__':
    codes = [
        'sh600004', 'sh600005', 'sh600006',
        'sh600007', 'sh600008', 'sh600009',
    ]
    # Size of each of the first three slices; the fourth slice takes
    # whatever is left over.
    chunk = len(codes) // 4
    tasks = [codes[i * chunk:(i + 1) * chunk] for i in range(3)]
    tasks.append(codes[3 * chunk:])

    # Fetch every slice in its own thread.
    multi_thread(tasks)
var hq_str_sh600004="白云机场,13.960,14.040,14.060,14.130,13.950,14.060,14.070,1511419,21244732.000,14186,14.060,257400,14.050,31900,14.040,6300,14.020,2200,14.010,10500,14.070,26100,14.080,81001,14.090,68300,14.100,12000,14.110,2016-12-19,10:40:27,00";
var hq_str_sh600005="武钢股份,3.490,3.510,3.520,3.560,3.460,3.520,3.530,20048507,70516972.000,937000,3.520,454000,3.510,776500,3.500,382300,3.490,678400,3.480,570361,3.530,1529951,3.540,1032710,3.550,1677780,3.560,797700,3.570,2016-12-19,10:40:27,00";

var hq_str_sh600006="东风汽车,6.930,6.940,6.890,6.930,6.800,6.890,6.900,11088555,76080639.000,3400,6.890,46300,6.880,81014,6.870,176900,6.860,130300,6.850,51300,6.900,78300,6.910,65900,6.920,54200,6.930,70800,6.940,2016-12-19,10:40:27,00";


var hq_str_sh600007="中国国贸,17.800,17.890,17.110,17.800,17.090,17.120,17.130,3981125,68517288.000,1500,17.120,33325,17.110,41800,17.100,4100,17.090,21300,17.080,541,17.130,9600,17.140,4000,17.150,11800,17.160,10200,17.170,2016-12-19,10:40:27,00";

var hq_str_sh600008="首创股份,4.110,4.100,4.100,4.120,4.080,4.100,4.110,4128304,16940208.000,505396,4.100,418100,4.090,732200,4.080,345000,4.070,204300,4.060,379200,4.110,561812,4.120,531400,4.130,654600,4.140,399900,4.150,2016-12-19,10:40:18,00";

var hq_str_sh600009="上海机场,26.510,26.620,26.620,26.690,26.350,26.610,26.620,919588,24375139.000,2400,26.610,4400,26.600,700,26.580,3400,26.560,500,26.550,200,26.620,9400,26.630,1500,26.650,9100,26.660,8600,26.680,2016-12-19,10:40:24,00";

你可能感兴趣的:(Python)