2week-homework

Summary


[Figure: Result.png]

Code
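
Both spiders below rely on shared setup that is not shown in the snippets. A minimal sketch of what they assume; the database and collection names and the proxy are my guesses, not the original project's:

import requests
from bs4 import BeautifulSoup
import pymongo

client = pymongo.MongoClient('localhost', 27017)
ganji = client['ganji']            # assumed database name
url_list = ganji['url_list']       # Table-1: item URLs harvested from listing pages
item_info = ganji['item_info']     # Table-2: per-item details
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.21 Safari/537.36',
    'Connection': 'keep-alive'
}
proxies = {'http': 'http://127.0.0.1:8087'}   # placeholder proxy; replace with a working one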


# Spider 1
def get_url_list_from(channel, page, who_sells=''):
    # Build the listing-page URL, e.g. '{channel}/o{page}' or '{channel}/{who_sells}o{page}'
    list_view = '{}/{}o{}'.format(channel, who_sells, page)
    wb_data = requests.get(list_view, headers=headers, proxies=proxies, timeout=6)
    if wb_data.status_code == 200:
        soup = BeautifulSoup(wb_data.text, 'lxml')
        # The pager div only appears on valid listing pages; its absence means we ran past the last page
        if soup.find('div', 'pageBox'):
            links = soup.select('dl.list-bigpic > dt > a')
            data = {
                'link': list(map(lambda x: x.get('href'), links))
            }
            use_urls = []
            for item in data['link']:
                # Keep only links that point to ganji.com; skip ads and external redirects
                if 'ganji.com' in item:
                    use_urls.append(item)
                    print('item: ' + item)
                    url_list.insert_one({'url': item})   # record the URL in Table-1
                    get_item_info(item)                  # scrape the detail page immediately
            print(use_urls)
        else:
            print('last page: ' + str(page))
    else:
        print('status_code is ' + str(wb_data.status_code))



# Spider 2
def get_item_info(url):
    wb_data = requests.get(url, headers=headers)
    if wb_data.status_code == 404:
        print('404 not found')
    else:
        try:
            soup = BeautifulSoup(wb_data.text, 'lxml')

            prices = soup.select('.f22.fc-orange.f-type')
            pub_dates = soup.select('.pr-5')
            areas = soup.select('ul.det-infor > li:nth-of-type(3) > a')
            cates = soup.select('ul.det-infor > li:nth-of-type(1) > span > a')

            # Every select() may come back empty, hence the guards below
            data = {
                'title': soup.title.text.strip(),
                'price': prices[0].text.strip() if len(prices) > 0 else 0,
                'pub_date': pub_dates[0].text.strip().replace(u'\xa0', u' ') if len(pub_dates) > 0 else "",
                'area': list(map(lambda x: x.text.strip(), areas)),
                'cate': list(map(lambda x: x.text.strip(), cates)),
                'url': url
            }

            print(data)
            item_info.insert_one(data)   # record the item in Table-2
        except AttributeError:
            print('shits happened!')
        except UnicodeEncodeError:
            print('shits happened again!')

Takeaways


  • 1. Resuming an interrupted crawl

db_urls = [item['url'] for item in url_list.find()]       # every URL collected in Table-1
index_urls = [item['url'] for item in item_info.find()]   # every URL already scraped into Table-2
x = set(db_urls)
y = set(index_urls)
rest_of_urls = x - y                                      # what is still left to scrape

pool.map(get_item_info, rest_of_urls)

The idea is a logical resume rather than true network-level resumption: Table-1 (url_list) and Table-2 (item_info) share the same key, the URL. Since all URLs are collected into Table-1 first and their count stays fixed, even if scraping Table-2 is interrupted, the set difference between the two URL collections tells us exactly which items remain, as the self-contained illustration below shows.
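
With plain lists standing in for the two collections:

db_urls = ['u1', 'u2', 'u3', 'u4']   # Table-1: every listing URL collected
index_urls = ['u1', 'u2']            # Table-2: detail pages scraped before the interruption
rest_of_urls = set(db_urls) - set(index_urls)
print(rest_of_urls)                  # {'u3', 'u4'}: only these still need scraping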

  • 2. The set() built-in

a = ['1', '1', '2', '3', '5']
b = ['1', '1', '2', '5']

x = set(a)      # {'1', '2', '3', '5'}: order within a set is undefined; duplicates are simply dropped
y = set(b)

# Note: x and y are sets, not lists; call list(x) if you actually need a list.

print(x - y)    # {'3'}: sets support subtraction directly, removing the overlap
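
Beyond subtraction, the other set operators are handy for the same kind of bookkeeping (element order in the printed output may vary):

x = {'1', '2', '3', '5'}
y = {'1', '2', '5'}
print(x & y)   # {'1', '2', '5'}: intersection, items present in both
print(x | y)   # {'1', '2', '3', '5'}: union of the two
print(x - y)   # {'3'}: difference, what is still left to crawl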

  • 3. Using multiprocessing.Pool

# create a pool with one worker process per CPU core
pool = multiprocessing.Pool(multiprocessing.cpu_count())

close()           Stop the pool from accepting new tasks.

terminate()       Stop the worker processes immediately; unfinished tasks are discarded.

join()            Block the main process until the workers exit; must be called after close() or terminate().

Typical usage:

pool = multiprocessing.Pool(multiprocessing.cpu_count())

pool.map(get_all_links_from, channelist.split())   # channelist: whitespace-separated channel URLs

# pool.map(get_item_info, rest_of_urls)            # swap in for the resume pass

pool.close()

pool.join()

Reference article: 进程池 (process pools)
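
As an aside, since Python 3.3 a Pool can also be used as a context manager, which terminates the workers automatically on exit; a minimal runnable sketch:

import multiprocessing

def square(x):
    return x * x

if __name__ == '__main__':   # required guard when worker processes are spawned (e.g. on Windows)
    with multiprocessing.Pool(multiprocessing.cpu_count()) as pool:
        print(pool.map(square, range(5)))   # [0, 1, 4, 9, 16]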

  • 4. Headers

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.21 Safari/537.36',
    'Connection': 'keep-alive'
}

headers = {
    'User-Agent': r'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.87 Safari/537.36',
    'Cookie': r'id58=c5/ns1ct99sKkWWeFSQCAg==; city=bj; 58home=bj; ipcity=yiwu%7C%u4E49%u4E4C%7C0; als=0; myfeet_tooltip=end; bj58_id58s="NTZBZ1Mrd3JmSDdENzQ4NA=="; sessionid=021b1d13-b32e-407d-a76f-924ec040579e; bangbigtip2=1; 58tj_uuid=0ed4f4ba-f709-4c42-8972-77708fcfc553; new_session=0; new_uv=1; utm_source=; spm=; init_refer=; final_history={}; bj58_new_session=0; bj58_init_refer=""; bj58_new_uv=1'.format(str(info_id)),
    'Accept': '*/*',
    'Accept-Encoding': 'gzip, deflate, sdch',
    'Accept-Language': 'zh-CN,zh;q=0.8',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    'Host': 'jst1.58.com',
    'Referer': r'http://bj.58.com/pingbandiannao/{}x.shtml'.format(info_id)
}
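
Either dict is then passed straight into requests. A quick way to confirm the spoofed headers were actually sent, using the simpler first dict; the URL here is just an example:

wb_data = requests.get('http://bj.58.com/pingbandiannao/', headers=headers, timeout=6)
print(wb_data.request.headers.get('User-Agent'))   # the UA string we set, not requests' default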

  • 5. lambda (anonymous functions)

list(map(lambda x: x * x, [1, 2, 3, 4, 5, 6, 7, 8, 9]))
# [1, 4, 9, 16, 25, 36, 49, 64, 81]


links = soup.select('dl.list-bigpic > dt > a')

data = {
       'link': list(map(lambda x: x.get('href'), links))
}


Similarly, list(map(lambda x: x.text.strip(), cates)) from Spider 2 is equivalent to the comprehension:

[cate.text.strip() for cate in cates]

The most common pattern is using lambda inside map, applying an operation to every element of a list with far less code. It pairs with filter in the same way, as shown below.
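
Reusing data['link'] from Spider 1, the ganji.com check could be written either way:

use_urls = list(filter(lambda u: 'ganji.com' in u, data['link']))
# equivalent comprehension:
use_urls = [u for u in data['link'] if 'ganji.com' in u]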

  • 6. try/except

try:
    soup = BeautifulSoup(wb_data.text, 'lxml')

    prices = soup.select('.f22.fc-orange.f-type')
    pub_dates = soup.select('.pr-5')
    areas = soup.select('ul.det-infor > li:nth-of-type(3) > a')
    cates = soup.select('ul.det-infor > li:nth-of-type(1) > span > a')

    data = {
        'title': soup.title.text.strip(),
        'price': prices[0].text.strip() if len(prices) > 0 else 0,
        'pub_date': pub_dates[0].text.strip().split(' ')[0] if len(pub_dates) > 0 else "",
        'area': list(map(lambda x: x.text.strip(), areas)),
        'cate': list(map(lambda x: x.text.strip(), cates)),
        'url': url
    }

    print(data)
    item_info.insert_one(data)

except AttributeError:
    print('shits happened!')

except UnicodeEncodeError:
    print('shits happened again!')

def try_to_make(a_mess):
    try:
        print(1 / a_mess)
    except (ZeroDivisionError, TypeError):   # multiple exception types can be listed together
        print('ok~')


try_to_make('0')   # 1 / '0' raises TypeError, so this prints ok~

When crawling large amounts of data you can run into all kinds of problems, and they are not always easy to inspect and fix. Understanding and resolving the root cause is best, but for baffling errors, or ones that do not affect the information being scraped, a handler in the style of except UnicodeEncodeError lets the crawl sidestep them and keep going; a small refinement is sketched below.
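
One improvement over the bare prints in Spider 2 is binding the exception, so the log records which URL failed and why (same names as in get_item_info):

try:
    print(data)
    item_info.insert_one(data)
except (AttributeError, UnicodeEncodeError) as e:
    print('skipped {}: {}'.format(url, e))   # keep crawling, but record the cause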

  • 7. string.punctuation

from string import punctuation

if i['area']:   # i: one scraped item document
    area = [a for a in i['area'] if a not in punctuation]   # drop entries that are just a punctuation mark
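
A quick self-contained check of what that filter does, with sample data:

from string import punctuation

area = ['朝阳', '-', '大望路']
print([a for a in area if a not in punctuation])   # ['朝阳', '大望路']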
