URL normalization is the process by which URLs are modified and standardized in a consistent manner. The goal of the normalization process is to transform a URL into a normalized or canonical URL so it is possible to determine if two syntactically different URLs may be equivalent.
For our normalization we will use normalizations that preserve semantics. You should normalize a given url using the next rules (only these rules. They are slightly different from RFC).
HTTP://www.Example.com/ → http://www.example.com/
http://www.example.com/a%c2%b1b → http://www.example.com/a%C2%B1b
http://www.example.com/%7Eusername/ → http://www.example.com/~username/
http://www.example.com:80/bar.html → http://www.example.com/bar.html
http://www.example.com/a/b/../c/./d.html → http://www.example.com/a/c/d.html
Additional links: If you are interested to know more about URL normalization (This is not necessarily for this task), then you can find more information here: Wikipedia, RFC3986
Precondition: All input urls are valid.
Input: URL, an unicode string.
Output: Normalized URL, a string.
Example:
1
2
3
4
5
6
|
checkio( "Http://Www.Checkio.org" ) = = "http://www.checkio.org"
checkio( "http://www.checkio.org/%cc%b1bac" ) = = "http://www.checkio.org/%CC%B1bac"
checkio( "http://www.checkio.org/task%5F%31" ) = = "http://www.checkio.org/task_1"
checkio( "http://www.checkio.org:80/home/" ) = = "http://www.checkio.org/home/"
checkio( "http://www.checkio.org:8080/home/" ) = = "http://www.checkio.org:8080/home/"
checkio( "http://www.checkio.org/task/./1/../2/././name" ) = = "http://www.checkio.org/task/ 2/name
"
|
def checkio(url):
url = url.lower().replace(':8080', ':08080')
url = url.replace('%2d', '-').replace('%2e', '.').replace('%5f', '_').replace('%7e', '~').replace(':80', '').replace(':08080', ':8080')
data = url.split('%')
for i in range(len(data)):
try:
char = int(data[i][:2], 16)
if (char >= 65 and char <= 91) or (char >= 97 and char <= 122) or (char >= 48 and char <= 57):
data[i] = chr(char).lower() + data[i][2:]
else:
if i > 0:
data[i] = '%' + data[i][:2].upper() + data[i][2:]
except:
if i > 0:
data[i] = '%' + data[i][:2].upper() + data[i][2:]
url = ''.join(data)
ret = []
i = 0
while i < len(url):
if i+2 < len(url) and url[i:i+3] == '/..':
while(ret.pop(-1) != '/'):
pass
i += 3
elif i+1 < len(url) and url[i:i+2] == '/.':
i += 2
else:
ret.append(url[i])
i += 1
url = ''.join(ret)
return url
checkio("http://Example.com:80/%48%6f%6d%45")
checkio("http://example.com:80/HOME/../././Guest/1/../2/..")
if __name__ == '__main__':
assert checkio(u"Http://Www.Checkio.org") == \
"http://www.checkio.org", "1st rule"
assert checkio(u"http://www.checkio.org/%cc%b1bac") ==\
"http://www.checkio.org/%CC%B1bac", "2nd rule"
assert checkio(u"http://www.checkio.org/task%5F%31") == \
"http://www.checkio.org/task_1", "3rd rule"
assert checkio(u"http://www.checkio.org:80/home/") == \
"http://www.checkio.org/home/", "4th rule"
assert checkio(u"http://www.checkio.org:8080/home/") == \
"http://www.checkio.org:8080/home/", "4th rule again"
assert checkio(u"http://www.checkio.org/task/./1/../2/././name") == \
"http://www.checkio.org/task/2/name", "5th rule"
print('First set of tests done')
def checkio(url):
# 1. parse url
# 1.1. detach scheme
if ':' in url.split('/', 1)[0]:
scheme, url = url.split(':', 1)
else:
scheme = ''
# 1.2. detach fragment
if '#' in url:
url, fragment = url.rsplit('#', 1)
else:
fragment = ''
# 1.3. detach query
if '?' in url:
url, query = url.rsplit('?', 1)
else:
query = ''
# 1.4. detach authority
if url.startswith('//'):
url = url[2:]
if '/' in url:
authority, path = url.split('/', 1)
path = '/' + path
else:
authority, path = url, ''
else:
authority, path = '', url
# 1.5. detach userinfo
if '@' in authority:
userinfo, authority = authority.split('@', 1)
else:
userinfo = ''
# 1.6. split host and port
if ':' in authority:
host, port = authority.split(':', 1)
else:
host, port = authority, ''
# 1.7. split path segments
segments = path.split('/')
# 2. perform normalization
# 2.1. convert scheme and host to lowercase
scheme = scheme.lower()
host = host.lower()
# 2.1a. path segments should NOT be lowercased according to RFC 3986
# section 6.2.2.1. but the evaluator requires me to do that :S
segments = [s.lower() for s in segments]
# 2.2. process percent-encoded octets in userinfo, host and path segments
hexdigits = '0123456789ABCDEFabcdef'
alnum = ''.join(chr(i) for i in range(128) if chr(i).isalnum())
unreserved = alnum + '-._~'
# we associate a boolean to each escapable url part that tells whether we
# would like to lowercase unescaped characters (see also comment 2.1a.)
escapable = [(userinfo, False), (host, True)] + [(s, True) for s in segments]
processed = []
for s, lc in escapable:
t = s.split('%')
for i in range(1, len(t)):
# t[:2] is a possible percent-encoded octet (without the %)
if t[i][0] in hexdigits and t[i][1] in hexdigits:
c = chr(int(t[i][:2], 16))
if c in unreserved:
t[i] = (c.lower() if lc else c) + t[i][2:]
else:
t[i] = '%' + t[i][:2].upper() + t[i][2:]
else:
t[i] = '%' + t[i] # invalid percent encoding, ignore it
processed.append(''.join(t))
userinfo, host, *segments = processed
# 2.3. remove default port
defports = {
'ftp': 21, 'gopher': 70, 'http': 80, 'https': 443, 'ldap': 389,
'ldaps': 636, 'mms': 1755, 'news': 119, 'pop': 110, 'rlogin': 513,
'rsync': 873, 'rtsp': 554, 'rtspu': 554, 'sip': 5060, 'sips': 5061,
'snews': 563, 'ssh': 22, 'telnet': 23, 'tn3270': 23
}
if scheme in defports and port == str(defports[scheme]):
port = ''
# 2.4. remove dot-segments
processed = []
for s in segments:
if s == '.':
pass
elif s == '..':
processed and processed.pop() # remove last segment if it exists
else:
processed.append(s)
segments = processed
# 3. recompose url
# 3.1. join path
path = '/'.join(segments)
# 3.2. reconstruct authority
authority = host
if port:
authority += ':' + port
if userinfo:
authority = userinfo + authority
# 3.3. build complete url
url = ''
if scheme:
url += scheme + ':'
if authority:
url += '//' + authority
url += path
if query:
url += '?' + query
if fragment:
url += '#' + fragment
return url