Crawler notes: code that bails out when a fetch comes back empty and moves on to the next item

import requests
from lxml import etree

# Strip the whitespace from each string in the list, then concatenate the pieces
def processing(strs):
    s = ''  # accumulator for the cleaned content
    for n in strs:
        n = ''.join(n.split())  # drop all whitespace characters
        s = s + n  # append the cleaned piece
    return s  # return the concatenated string
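
# A quick illustrative check of processing() (an added example, safe to
# delete): all whitespace is stripped from each piece before joining.
assert processing(['  Demo ', ' Goods \n']) == 'DemoGoods'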

def get_info(url, headers, goods_id):
    response = requests.get(url=url, headers=headers)
    response.encoding = 'utf-8'
    content = response.text
    html = etree.HTML(content)
    # Product title; an empty result means this id has no page,
    # so return None and let the caller move on to the next id
    id_name = html.xpath("//h1[@class='detail-title am-margin-bottom-xs']/text()")
    if not id_name:
        return None
    id_price = html.xpath("//b[@class='goods-price']/text()")
    id_read = html.xpath("//span[@class='tm-count']/text()")
    # the second tm-count is taken as the view count (as in the original);
    # guard against pages where it is missing
    id_read = id_read[1] if len(id_read) > 1 else ''
    id_own = html.xpath("//span[@class='stock']/text()")
    id_name = processing(id_name)

    records = []
    records.append(str(goods_id) + id_name + processing(id_price) + processing(id_own) + str(id_read))
    return records
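
# Optional hardening (a sketch added here, not part of the original script):
# a network error inside requests.get() would otherwise abort the whole
# sweep, so this thin wrapper turns it into a None result and the caller's
# existing "skip and continue" logic handles it unchanged.
def get_info_safe(url, headers, goods_id):
    try:
        return get_info(url, headers, goods_id)
    except requests.exceptions.RequestException:
        return None  # treat an unreachable page like an empty one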
def to_save(records):
    # append, so records survive across runs
    with open('goods.txt', 'a', encoding='utf-8') as fp:
        for record in records:
            fp.write(str(record) + '\n')
if __name__ == '__main__':
    headers = {
        'User-Agent':
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/109.0'
    }
    for i in range(1, 2000):
        url = 'http://123.57.186.250/index.php?s=/index/goods/index/id/{}.html'.format(i)
        records = get_info(url, headers, i)
        if records is None:
            continue  # empty page: skip this id and try the next one
        print(i)
        to_save(records)
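
If the target host is flaky, swapping get_info(url, headers, i) in the loop for the get_info_safe wrapper above keeps a single timeout or connection reset from killing the remaining ids.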
