Python实例:网络爬虫抓取豆瓣3万本书(6)

# -*- coding:utf-8 -*-
import time
from tool.ExcelManager import readExcel  
from tool.DbManager import DbManager  
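# Merge the per-tag list-page Excel files into the database.
# Step 6: de-duplicate the books and write them to the database.
# For each Excel row: insert the book into `book` unless its bookno is already
# present, then insert a (bookno, tag, kind) row into `booktag` unless it exists.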

def _sql_quote_escape(text):
    """Return *text* with single and double quotes backslash-escaped.

    Raises AttributeError when *text* is not a string (no ``replace`` method).
    """
    return text.replace("'", "\\'").replace('"', '\\"')


# Script body: walk every tag listed in web/booktag.xlsx, read that tag's
# workbook under books/<kind>/<tag>.xlsx and, for each row, insert the book
# into `book` (unless its bookno is already there) and the (book, tag, kind)
# link into `booktag` (unless that link is already there).
#
# NOTE(review): the SQL below is still assembled by string interpolation; if
# DbManager supports parameterized queries, prefer them over manual escaping.

# time.clock() was removed in Python 3.8; perf_counter() is its replacement.
start = time.perf_counter()
taglist = readExcel('web/booktag.xlsx')  # tag list workbook
taglist = taglist[1:]  # drop the header row (slicing tolerates an empty sheet)
dbManager = DbManager()
for tag in taglist:
    kind = tag[0]  # top-level category
    tagname = tag[1]  # tag name
    excelpath = 'books/' + kind + '/' + tagname + '.xlsx'  # local workbook
    try:
        datas = readExcel(excelpath)
    except Exception as e:
        # Best effort: a missing or unreadable workbook only skips this tag.
        print(e)
        continue
    datas = datas[1:]  # drop the header row

    # Tag and kind are inlined into SQL literals too, so escape them like the
    # other string fields (the file path above needs the raw values).
    sql_tagname = _sql_quote_escape(tagname)
    sql_kind = _sql_quote_escape(kind)

    for data in datas:
        # Required fields. A short row, a non-string cell or a URL without
        # enough '/' separators skips this row instead of aborting the run.
        try:
            raw_url = data[1]
            bookname = _sql_quote_escape(data[0])
            bookurl = _sql_quote_escape(raw_url)
            bookimage = _sql_quote_escape(data[2])
            # Take the id from the raw URL so it is escaped exactly once.
            bookno = _sql_quote_escape(raw_url.split('/')[-2])
        except (IndexError, AttributeError) as e:
            print(e)
            continue

        # Optional fields fall back to defaults when missing or not a string.
        try:
            bookinfo = _sql_quote_escape(data[3])
        except (IndexError, AttributeError):
            bookinfo = ''
        try:
            bookstar = data[4]
        except IndexError:
            bookstar = '0'

        # Is the book already stored?
        searchsql1 = "select * from `book` where `bookno`='" + bookno + "'"
        print(searchsql1)
        try:
            isexist1 = dbManager.execQuery(searchsql1)
        except Exception as e:
            print(e)
            continue

        if isexist1:
            print(bookname+':'+bookurl+'已经存在')
        else:
            insertbooksql = "INSERT INTO `book` (`bookname`, `bookurl`, `bookimg`, `bookinfo`, `bookstar`, `bookno`) VALUES ('" \
                        "{bookname}', '{bookurl}', '{bookimg}', '{bookinfo}', '{bookstar}', '{bookno}')"
            insert1 = insertbooksql.format(
                bookname=bookname,
                bookurl=bookurl,
                bookimg=bookimage,
                bookinfo=bookinfo,
                bookstar=bookstar,
                bookno=bookno,
            )
            print(insert1)
            try:
                dbManager.execNonQuery(insert1)
            except Exception as e:
                print(e)

        # Insert the book/tag link only when it is not stored yet.
        searchsql = "select * from `booktag` where `bookno`='{bookno}' and `booktag`='{booktag}' and `bookkind`='{bookkind}'"
        searchsql2 = searchsql.format(bookno=bookno, booktag=sql_tagname, bookkind=sql_kind)
        print(searchsql2)
        try:
            isexist2 = dbManager.execQuery(searchsql2)
        except Exception as e:
            # Without a lookup result we cannot tell whether the link exists;
            # skip the insert rather than reuse a stale or unbound result.
            print(e)
        else:
            if len(isexist2) == 0:
                inserttag = "INSERT INTO `booktag`(`bookname`,`bookno`,`booktag`,`bookkind`) VALUES ('" \
                            "{bookname}', '{bookno}', '{booktag}', '{bookkind}')"
                insert2 = inserttag.format(
                    bookname=bookname,
                    bookno=bookno,
                    booktag=sql_tagname,
                    bookkind=sql_kind,
                )
                print(insert2)
                try:
                    dbManager.execNonQuery(insert2)
                except Exception as e:
                    print(e)
        print('-'*100)
print("插入数据库结束")
end = time.perf_counter()
print("合并图书列表进数据库总共运行时间 : %.03f 秒" %(end-start))

你可能感兴趣的:(Python,肥宝的实验室)