# -*- coding: utf-8 -*
import requests
import re
from bs4 import BeautifulSoup
import MySQLdb
# Scrape the eol.cn gaokao subject-selection tables and load every table row,
# plus rows derived from splitting the subject column, into MySQL.
#
# NOTE(review): the text of this script had lost its indentation; the nesting
# below is reconstructed from the statement order -- confirm that it matches
# the intended behaviour.

# Database connection settings.
# NOTE(review): credentials are hard-coded here; consider moving them to
# configuration that lives outside the source tree.
DB_SETTINGS = {
    'host': '192.168.0.129',
    'user': 'root',
    'passwd': '123456',
    'db': 'take_data',
    'charset': 'utf8',
}

# Page whose text provides the label stored in the first column of every row.
INDEX_PAGE_URL = "https://www.eol.cn/e_html/gk/tjxk/index.shtml"
# Script that lists the per-province pages (a stray leading space was removed).
INDEX_JS_URL = "https://www.eol.cn/e_html/gk/tjxk/js/index.js"
# Prefix that each page name found in the index script is appended to.
PROVINCE_PAGE_PREFIX = "https://www.eol.cn/e_html/gk/tjxk/html/html_"

REQUEST_HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'}
# Seconds to wait on a request, so a stalled server cannot hang the script.
REQUEST_TIMEOUT = 30

INSERT_SQL = "insert into aaaaaa values (%s,%s,%s,%s,%s,%s,%s)"

# Separators that may join several subjects inside one subject cell.
SUBJECT_SEPARATORS = ("/", "+")
# Number of <td> cells a data row must have (year .. subjects).
EXPECTED_CELLS = 6


def _fetch(url):
    """Return the raw body (bytes) of a GET request to *url*."""
    response = requests.get(url, headers=REQUEST_HEADERS, timeout=REQUEST_TIMEOUT)
    return response.content


def _decode(raw):
    """Decode a response body to text so text regex patterns can be applied.

    The patterns used here are text; matching them against undecoded bytes
    raises TypeError on Python 3. Undecodable bytes are replaced, not fatal.
    """
    return raw.decode("utf-8", "replace")


def _without_spaces(value):
    """Return *value* with spaces removed; None is passed through unchanged.

    ``.string`` is None for a cell without exactly one string child. Keeping
    it None stores NULL instead of the literal text "None".
    """
    if value is None:
        return None
    return value.replace(" ", "")


def _expand_subjects(subjects):
    """Return the extra subject values derived from one subject cell.

    For each separator present in *subjects*:
      * two parts   -> each part on its own;
      * three parts -> the pairs (1,2), (2,3), (1,3) re-joined with the same
        separator, followed by each part on its own;
      * any other number of parts -> nothing.
    Values for "/" come before values for "+".
    """
    derived = []
    if subjects is None:
        return derived
    for separator in SUBJECT_SEPARATORS:
        if separator not in subjects:
            continue
        parts = subjects.split(separator)
        if len(parts) == 3:
            first, second, third = parts
            derived.append(separator.join((first, second)))
            derived.append(separator.join((second, third)))
            derived.append(separator.join((first, third)))
        if len(parts) in (2, 3):
            derived.extend(parts)
    return derived


def _store_page(cur, gkcs, html):
    """Insert every data row of one province page, plus its derived rows.

    cur  -- open database cursor used for the inserts.
    gkcs -- label written to the first column of every inserted row.
    html -- raw page body; handed to BeautifulSoup undecoded, as before.
    """
    soup = BeautifulSoup(html, "html.parser")
    for row in soup.find_all('tr'):
        cells = row.find_all("td")
        if len(cells) < EXPECTED_CELLS:
            # Rows without the six data cells (e.g. header rows) used to
            # abort the whole run with IndexError; skip them instead.
            continue
        year = cells[0].string
        city = cells[1].string
        school = _without_spaces(cells[2].string)
        cenci = cells[3].string
        tech = _without_spaces(cells[4].string)
        kemu = cells[5].string
        common = (gkcs, year, city, school, cenci, tech)

        # The row is always stored once exactly as it appears in the table.
        cur.execute(INSERT_SQL, common + (kemu,))

        # Translated original note: the Jilin data file is problematic and is
        # handled separately, so its subject cell is not expanded here.
        # (The u-prefix keeps this non-ASCII literal text on Python 2 too.)
        if city == u"吉林":
            continue

        for subject in _expand_subjects(kemu):
            cur.execute(INSERT_SQL, common + (subject,))


def main():
    """Scrape all listed province pages and commit the rows in one go.

    Raises RuntimeError when the index page does not contain the expected
    label. The cursor and connection are closed even if a step fails; the
    commit happens only after every page has been processed.
    """
    conn = MySQLdb.connect(**DB_SETTINGS)
    try:
        cur = conn.cursor()
        try:
            # Translated original note: "get the gaokao province name".
            # The value is the text before the last "新" on the first line
            # of the index page that contains that character.
            labels = re.findall(u"(.*)新", _decode(_fetch(INDEX_PAGE_URL)))
            if not labels:
                raise RuntimeError(
                    "no label found on index page %s" % INDEX_PAGE_URL)
            gkcs = labels[0]

            # Page names are the text between "html_" and a closing quote
            # that is followed by a carriage return in the index script.
            index_js = _decode(_fetch(INDEX_JS_URL))
            for page_name in re.findall(r"html_(.*)'\r", index_js):
                _store_page(cur, gkcs, _fetch(PROVINCE_PAGE_PREFIX + page_name))
        finally:
            cur.close()
        conn.commit()
    finally:
        conn.close()


# Run unconditionally, as before: the script does its work at module level.
main()