爬虫时遇到的日期格式太多,统计一下日期的格式:
2018-6-21、2018年6月21、2018/6/21、21st Jun,2018、Jun 21st,2018(其他月份还可能使用缩写形式)、Jun 21,2018、21-22 Jun 2018、Jun 21-22,2018、Thursday,21 Jun 2018(星期也可能会有缩写),另外还有一些尚未收录的格式。为了将日期格式统一,就需要对这些字符串进行解析。而网上相关的教程太少,只好自己慢慢积累了。
import re
class DateFormatHelper(object):
    """Normalise date strings met while crawling into ``year-month-day``.

    Every pattern is matched at the start of the input (``re.match``), so
    trailing text after the date is tolerated.  The patterns are tried in the
    order given by ``dateformatregexs``; the first one whose month can be
    resolved wins.

    Patterns that yield date parts expose them as the named groups ``year``,
    ``month`` and ``day``.  A pattern without named groups (``regex3``) marks
    input that is already in the target format.
    """

    # Optional leading weekday, full or abbreviated ("Thursday,", "Thu. ").
    _WEEKDAY_PREFIX = r"(?:(?:mon|tue|wed|thu|fri|sat|sun)[a-z]*\.?[, ] *)?"
    # Optional English ordinal suffix after a day number ("21st", "22nd").
    _ORDINAL_SUFFIX = r"(?:st|nd|rd|th)?"

    # "21-22 Jun 2018": day range first; only the first day is captured.
    regex1 = re.compile(
        _WEEKDAY_PREFIX
        + r"(?P<day>[0-9]{1,2})" + _ORDINAL_SUFFIX
        + r"-[0-9]{1,2}" + _ORDINAL_SUFFIX
        + r" *(?P<month>[A-Za-z]+),? *(?P<year>[0-9]{4})",
        re.IGNORECASE,
    )
    # "21 Jun 2018", "21st Jun,2018", "Thursday,21 Jun 2018".
    regex2 = re.compile(
        _WEEKDAY_PREFIX
        + r"(?P<day>[0-9]{1,2})" + _ORDINAL_SUFFIX
        + r" *(?P<month>[A-Za-z]+),? *(?P<year>[0-9]{4})",
        re.IGNORECASE,
    )
    # "2018-6-21": already in the target format, hence no capture groups.
    regex3 = re.compile(r"[0-9]{4}-[0-9]{1,2}-[0-9]{1,2}")
    # "Jun 21, 2018", "Jun 21,2018", "Jun 21st,2018".
    regex4 = re.compile(
        _WEEKDAY_PREFIX
        + r"(?P<month>[A-Za-z]+) *(?P<day>[0-9]{1,2})" + _ORDINAL_SUFFIX
        + r", *(?P<year>[0-9]{4})",
        re.IGNORECASE,
    )
    # "Jun 21-22, 2018": month first, day range; only the first day is captured.
    regex5 = re.compile(
        _WEEKDAY_PREFIX
        + r"(?P<month>[A-Za-z]+) *(?P<day>[0-9]{1,2})" + _ORDINAL_SUFFIX
        + r"-[0-9]{1,2}" + _ORDINAL_SUFFIX
        + r", *(?P<year>[0-9]{4})",
        re.IGNORECASE,
    )
    # "2018年6月21日"; the trailing "日" is optional.
    regex6 = re.compile(
        r"(?P<year>[0-9]{4})年(?P<month>[0-9]{1,2})月(?P<day>[0-9]{1,2})日?"
    )
    # "2018/6/21".
    regex7 = re.compile(
        r"(?P<year>[0-9]{4})/(?P<month>[0-9]{1,2})/(?P<day>[0-9]{1,2})"
    )
    # Order matters: range patterns must be tried before their single-day
    # counterparts' neighbours, exactly as in the original list.
    dateformatregexs = [regex1, regex2, regex3, regex4, regex5, regex6, regex7]

    # Lower-case three-letter abbreviations -> month number (as a string).
    monthMap = {"sep": "9", "oct": "10", "nov": "11", "dec": "12", "jan": "1", "feb": "2",
                "aug": "8", "jul": "7", "jun": "6", "may": "5", "apr": "4", "mar": "3"}
    # Full English month names -> month number (as a string).
    monthMap2 = {"September": "9", "October": "10", "November": "11", "December": "12",
                 "January": "1", "February": "2", "August": "8", "July": "7",
                 "June": "6", "May": "5", "April": "4", "March": "3"}

    @classmethod
    def _monthNumber(cls, name):
        """Return the month number string for an English month name, or None.

        Lookup is case-insensitive.  Three-letter abbreviations are resolved
        through ``monthMap``; anything else is accepted when it is a prefix
        (at least three letters) of a full name in ``monthMap2``, which covers
        full names as well as longer abbreviations such as "Sept".
        """
        key = name.lower()
        number = cls.monthMap.get(key)
        if number is not None:
            return number
        if len(key) < 3:
            # Shorter prefixes are ambiguous ("ju", "ma") or not months at all.
            return None
        for fullName, fullNumber in cls.monthMap2.items():
            if fullName.lower().startswith(key):
                return fullNumber
        return None

    @classmethod
    def convertStandardDateFormat(cls, datestr: str) -> str:
        """Convert a date string to ``year-month-day``.

        Year, month and day digits are emitted exactly as written in the
        input (no zero padding is added or removed); month names are replaced
        by their number.  For a day range such as "21-22 Jun 2018" only the
        first day is used.  Input that already starts with ``YYYY-M-D`` is
        returned unchanged apart from stripped leading whitespace, including
        any trailing text.

        :param datestr: the raw date text; leading whitespace is ignored.
        :return: the normalised date, or "" when ``datestr`` is None, is not
            a string, or matches none of the known formats.
        """
        if not isinstance(datestr, str):
            return ""
        text = datestr.lstrip()
        for regex in cls.dateformatregexs:
            match = regex.match(text)
            if match is None:
                continue
            parts = match.groupdict()
            if not parts:
                # Pattern without named groups: already in the target format.
                return text
            month = parts["month"]
            if not month.isdigit():
                month = cls._monthNumber(month)
                if month is None:
                    # Looked like a date but the word is not a month name;
                    # give the remaining patterns a chance.
                    continue
            return "-".join((parts["year"], month, parts["day"]))
        return ""
使用时调用 convertStandardDateFormat() 这个函数,准确地说,是在需要时调用这个 classmethod,例如 DateFormatHelper.convertStandardDateFormat(datestr)。这个函数会识别日期字符串符合哪一种已规定的日期格式。之后就好办了:知道哪个是年、哪个是月、哪个是日,再做简单的替换赋值就可以了。这个程序目前还只能解析寥寥几种格式,所以放到网上,希望看到的人能继续补充解析。如果可以,就在这里回复你们写的解析程序的地址,我去转载过来继续完善。新手上路,有问题或建议请留言,不要喷啊...