对动态网页进行逆向工程

依次以字母表中的每个字母作为搜索词,逐页遍历 AJAX 接口返回的 JSON 结果,从而抓取所有国家的名称。抓取结果去重并排序后,会写入文本文件 countries.txt。

import string
from downloader import Downloader
import json
# Scrape every country name from the site's AJAX search endpoint by searching
# for each ASCII letter in turn and walking all result pages of each search.
D = Downloader()
template_url = 'http://example.webscraping.com/ajax/search.json?page={}&page_size=10&search_term={}'
# A set, because the same country matches several different search letters.
countries = set()

# ascii_lowercase is locale-independent and exists on both Python 2 and 3
# (string.lowercase is Python-2-only and varies with the locale).
for letter in string.ascii_lowercase:
    page = 0
    while True:
        html = D(template_url.format(page, letter))
        try:
            # Parse the JSON response into a dict.
            ajax = json.loads(html)
        except (TypeError, ValueError) as e:
            # ValueError: the body is not valid JSON.
            # TypeError: the downloader returned a non-string (presumably
            # None on a failed download -- confirm against Downloader).
            # Either way, give up on this letter and move on to the next.
            print(e)
            break
        for record in ajax['records']:
            countries.add(record['country'])
        page += 1
        # 'num_pages' is the page count reported by the response; pages are
        # requested starting from 0, so stop once page reaches that count.
        if page >= ajax['num_pages']:
            break

# Use a context manager so the file is flushed and closed deterministically.
with open('countries.txt', 'w') as out:
    out.write('\n'.join(sorted(countries)))

你可能感兴趣的:(python爬虫)