cs100/lab3
"id","title","description","manufacturer","price"
"id","name","description","manufacturer","price"
import re
# Regular expression for one data-file row: five comma-separated fields, the
# second of which must be wrapped in double quotes. Groups 1-5 capture the fields.
DATAFILE_PATTERN = '^(.+),"(.+)",(.*),(.*),(.*)'
def removeQuotes(s):
    """ Strip every double-quote character from a string
    Args:
        s (str): text that may contain '"' characters
    Returns:
        str: the same text with all '"' characters deleted
    """
    return s.replace('"', '')
def parseDatafileLine(datafileLine):
    """ Classify and parse one line of a product data file
    Args:
        datafileLine (str): a single line read from the data file
    Returns:
        tuple: (datafileLine, -1) if the line does not match DATAFILE_PATTERN,
            (datafileLine, 0) if it is the header line (first field is '"id"'),
            otherwise ((id, 'field2 field3 field4'), 1) with quotes removed from the id
    """
    parsed = re.search(DATAFILE_PATTERN, datafileLine)
    if parsed is None:
        print('Invalid datafile line: %s' % datafileLine)
        return (datafileLine, -1)
    if parsed.group(1) == '"id"':
        print('Header datafile line: %s' % datafileLine)
        return (datafileLine, 0)
    # Fields 2-4 are concatenated into one text value; field 5 is not used.
    recordId = removeQuotes(parsed.group(1))
    product = '%s %s %s' % parsed.group(2, 3, 4)
    return ((recordId, product), 1)
import sys
import os
from test_helper import Test
# Data files are addressed as <baseDir>/<inputPath>/<file name>.
baseDir = os.path.join('data')
inputPath = os.path.join('cs100', 'lab3')
# File names of the individual datasets; they are joined onto
# baseDir/inputPath at the point where each file is loaded.
GOOGLE_PATH = 'Google.csv'
GOOGLE_SMALL_PATH = 'Google_small.csv'
AMAZON_PATH = 'Amazon.csv'
AMAZON_SMALL_PATH = 'Amazon_small.csv'
GOLD_STANDARD_PATH = 'Amazon_Google_perfectMapping.csv'
STOPWORDS_PATH = 'stopwords.txt'
def parseData(filename):
    """ Read a data file into an RDD and parse every line
    Args:
        filename (str): path of the data file to read
    Returns:
        RDD: cached RDD holding the parseDatafileLine result for each line
    """
    # NOTE(review): 4 and 0 are passed positionally to textFile -- presumably
    # minPartitions and use_unicode; confirm against the PySpark version in use.
    lines = sc.textFile(filename, 4, 0)
    parsedLines = lines.map(parseDatafileLine)
    return parsedLines.cache()
def loadData(path):
    """ Load a data file and keep only its successfully parsed records
    Args:
        path (str): file name of the data file, relative to baseDir/inputPath
    Returns:
        RDD: a cached RDD of the (id, text) pairs from lines that parsed successfully
    Raises:
        AssertionError: if any line failed to parse, or if the file does not
            consist of exactly one header line plus the valid lines
    """
    filename = os.path.join(baseDir, inputPath, path)
    raw = parseData(filename).cache()
    failed = (raw
              .filter(lambda s: s[1] == -1)
              .map(lambda s: s[0]))
    for line in failed.take(10):
        print('%s - Invalid datafile line: %s' % (path, line))
    valid = (raw
             .filter(lambda s: s[1] == 1)
             .map(lambda s: s[0])
             .cache())
    # Each count() is a Spark action; run it once and reuse the number for
    # both the report and the sanity checks instead of recomputing it.
    rawCount = raw.count()
    validCount = valid.count()
    failedCount = failed.count()
    print('%s - Read %d lines, successfully parsed %d lines, failed to parse %d lines' % (path,
                                                                                        rawCount,
                                                                                        validCount,
                                                                                        failedCount))
    assert failedCount == 0
    # The single line that is neither valid nor failed is the header.
    assert rawCount == (validCount + 1)
    return valid
# Load the small samples and the full datasets of both vendors.
googleSmall = loadData(GOOGLE_SMALL_PATH)
google = loadData(GOOGLE_PATH)
amazonSmall = loadData(AMAZON_SMALL_PATH)
amazon = loadData(AMAZON_PATH)
Google_small.csv - Read 201 lines, successfully parsed 200 lines, failed to parse 0 lines
Google.csv - Read 3227 lines, successfully parsed 3226 lines, failed to parse 0 lines
Amazon_small.csv - Read 201 lines, successfully parsed 200 lines, failed to parse 0 lines
Amazon.csv - Read 1364 lines, successfully parsed 1363 lines, failed to parse 0 lines
# Show the first three parsed records of each small dataset.
for googleRec in googleSmall.take(3):
    print('google: %s: %s\n' % (googleRec[0], googleRec[1]))
for amazonRec in amazonSmall.take(3):
    print('amazon: %s: %s\n' % (amazonRec[0], amazonRec[1]))
google: http://www.google.com/base/feeds/snippets/11448761432933644608: spanish vocabulary builder "expand your vocabulary! contains fun lessons that both teach and entertain you'll quickly find yourself mastering new terms. includes games and more!"
google: http://www.google.com/base/feeds/snippets/8175198959985911471: topics presents: museums of world "5 cd-rom set. step behind the velvet rope to examine some of the most treasured collections of antiquities art and inventions. includes the following the louvre - virtual visit 25 rooms in full screen interactive video detailed map of the louvre ..."
google: http://www.google.com/base/feeds/snippets/18445827127704822533: sierrahome hse hallmark card studio special edition win 98 me 2000 xp "hallmark card studio special edition (win 98 me 2000 xp)" "sierrahome"
amazon: b000jz4hqo: clickart 950 000 - premier image pack (dvd-rom) "broderbund"
amazon: b0006zf55o: ca international - arcserve lap/desktop oem 30pk "oem arcserve backup v11.1 win 30u for laptops and desktops" "computer associates"
amazon: b00004tkvy: noah's ark activity center (jewel case ages 3-8) "victory multimedia"
Note on terminology: a “token” is the result of parsing the document down to the elements we consider “atomic” for the task at hand. Tokens can be things like words, numbers, acronyms, or other exotica like word-roots or fixed-length character strings.
Bag of words techniques all apply to any sort of token, so when we say “bag-of-words” we really mean “bag-of-tokens,” strictly speaking.
A function `simpleTokenize(string)` that takes a string and returns a list of non-empty tokens in the string. `simpleTokenize` should split strings using the provided regular expression. Since we want to make token-matching case insensitive, make sure all tokens are turned lower-case. Give an interpretation, in natural language, of what the regular expression, `split_regex`, matches.
# TODO: Replace with appropriate code
# Sample sentence used to exercise the tokenizers.
quickbrownfox = 'A quick brown fox jumps over the lazy dog.'
# Matches a run of one or more non-word characters, i.e. anything that is
# not a letter, digit or underscore; used as the token separator.
split_regex = r'\W+'
def simpleTokenize(string):
    """ Split a string into lower-case tokens
    Args:
        string (str): text to tokenize
    Returns:
        list: the non-empty pieces obtained by lower-casing the text and
            splitting it on split_regex
    """
    lowered = string.lower()
    pieces = re.split(split_regex, lowered)
    # re.split yields '' at the edges when the text starts or ends with a separator.
    return [piece for piece in pieces if piece]
print simpleTokenize(quickbrownfox) # Should give ['a', 'quick', 'brown', ... ]
['a', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog']
# TEST Tokenize a String (1a)
Test.assertEquals(simpleTokenize(quickbrownfox),
['a','quick','brown','fox','jumps','over','the','lazy','dog'],
'simpleTokenize should handle sample text')
Test.assertEquals(simpleTokenize(' '), [], 'simpleTokenize should handle empty string')
Test.assertEquals(simpleTokenize('!!!!123A/456_B/789C.123A'), ['123a','456_b','789c','123a'],
'simpleTokenize should handle puntuations and lowercase result')
Test.assertEquals(simpleTokenize('fox fox'), ['fox', 'fox'],
'simpleTokenize should not remove duplicates')
1 test passed.
1 test passed.
1 test passed.
1 test passed.
`tokenize`, an improved tokenizer that does not emit stopwords.
# TODO: Replace with appropriate code
# Read the stopword file and collect its lines into a set on the driver,
# giving constant-time membership tests during tokenization.
stopfile = os.path.join(baseDir, inputPath, STOPWORDS_PATH)
stopwords = set(sc.textFile(stopfile).collect())
print 'These are the stopwords: %s' % stopwords
def tokenize(string):
    """ Tokenize a string and drop the stopwords
    Args:
        string (str): text to tokenize
    Returns:
        list: the simpleTokenize tokens, in order, that are not in stopwords
    """
    return [token for token in simpleTokenize(string) if token not in stopwords]
print tokenize(quickbrownfox) # Should give ['quick', 'brown', ... ]
These are the stopwords: set([u'all', u'just', u'being', u'over', u'both', u'through', u'yourselves', u'its', u'before', u'with', u'had', u'should', u'to', u'only', u'under', u'ours', u'has', u'do', u'them', u'his', u'very', u'they', u'not', u'during', u'now', u'him', u'nor', u'did', u'these', u't', u'each', u'where', u'because', u'doing', u'theirs', u'some', u'are', u'our', u'ourselves', u'out', u'what', u'for', u'below', u'does', u'above', u'between', u'she', u'be', u'we', u'after', u'here', u'hers', u'by', u'on', u'about', u'of', u'against', u's', u'or', u'own', u'into', u'yourself', u'down', u'your', u'from', u'her', u'whom', u'there', u'been', u'few', u'too', u'themselves', u'was', u'until', u'more', u'himself', u'that', u'but', u'off', u'herself', u'than', u'those', u'he', u'me', u'myself', u'this', u'up', u'will', u'while', u'can', u'were', u'my', u'and', u'then', u'is', u'in', u'am', u'it', u'an', u'as', u'itself', u'at', u'have', u'further', u'their', u'if', u'again', u'no', u'when', u'same', u'any', u'how', u'other', u'which', u'you', u'who', u'most', u'such', u'why', u'a', u'don', u'i', u'having', u'so', u'the', u'yours', u'once'])
['quick', 'brown', 'fox', 'jumps', 'lazy', 'dog']
# TEST Removing stopwords (1b)
Test.assertEquals(tokenize("Why a the?"), [], 'tokenize should remove all stopwords')
Test.assertEquals(tokenize("Being at the_?"), ['the_'], 'tokenize should handle non-stopwords')
Test.assertEquals(tokenize(quickbrownfox), ['quick','brown','fox','jumps','lazy','dog'],
'tokenize should handle sample text')
1 test passed.
1 test passed.
1 test passed.
`tokenize` the values, and then count the total number of tokens.
# TODO: Replace with appropriate code
# Replace each record's text with its list of non-stopword tokens, keeping the record key.
amazonRecToToken = amazonSmall.map(lambda line:(line[0],tokenize(line[1])))
googleRecToToken = googleSmall.map(lambda line:(line[0],tokenize(line[1])))
def countTokens(vendorRDD):
    """ Count and return the number of tokens
    Args:
        vendorRDD (RDD of (recordId, tokenizedValue)): Pair tuple of record ID to tokenized output
    Returns:
        count: total number of tokens over all records
    """
    # sum() is used instead of reduce() because reduce() raises on an empty RDD.
    return vendorRDD.map(lambda record: len(record[1])).sum()
totalTokens = countTokens(amazonRecToToken) + countTokens(googleRecToToken)
print 'There are %s tokens in the combined datasets' % totalTokens
There are 22520 tokens in the combined datasets
# TEST Tokenizing the small datasets (1c)
Test.assertEquals(totalTokens, 22520, 'incorrect totalTokens')
1 test passed.
# TODO: Replace with appropriate code
def findBiggestRecord(vendorRDD):
    """ Find the record that has the most tokens
    Args:
        vendorRDD (RDD of (recordId, tokens)): pairs of record ID and token list
    Returns:
        list: a one-element list holding the (recordId, tokens) pair with the longest token list
    """
    biggest = vendorRDD.max(key=lambda record: len(record[1]))
    return [biggest]
biggestRecordAmazon = findBiggestRecord(amazonRecToToken)
print 'The Amazon record with ID "%s" has the most tokens (%s)' % (biggestRecordAmazon[0][0],
len(biggestRecordAmazon[0][1]))
The Amazon record with ID "b000o24l3q" has the most tokens (1547)
# TEST Amazon record with the most tokens (1d)
Test.assertEquals(biggestRecordAmazon[0][0], 'b000o24l3q', 'incorrect biggestRecordAmazon')
Test.assertEquals(len(biggestRecordAmazon[0][1]), 1547, 'incorrect len for biggestRecordAmazon')
1 test passed.
1 test passed.
Note on terminology: Sometimes token weights depend on the document the token belongs to, that is, the same token may have a different weight when it’s found in different documents. We call these weights local weights. TF is an example of a local weight, because it depends on the length of the source. On the other hand, some token weights only depend on the token, and are the same everywhere that token is found. We call these weights global, and IDF is one such weight.
`tf(tokens)` that takes a list of tokens and returns a Python dictionary mapping tokens to TF weights.
`tokens` list, count 1 for each occurrence and add the token to the dictionary
`tokens`
def tf(tokens):
    """ Compute TF
    Args:
        tokens (list of str): input list of tokens from tokenize
    Returns:
        dictionary: a dictionary of tokens to its TF values (occurrences of
            the token divided by the total number of tokens); empty for an
            empty token list
    """
    counts = {}
    for token in tokens:
        counts[token] = counts.get(token, 0) + 1
    # Float total forces true division under Python 2 as well.
    total = float(len(tokens))
    return {token: count / total for token, count in counts.items()}
print tf(tokenize(quickbrownfox)) # Should give { ‘quick’: 0.1666 … }
{‘brown’: 0.16666666666666666, ‘lazy’: 0.16666666666666666, ‘jumps’: 0.16666666666666666, ‘fox’: 0.16666666666666666, ‘dog’: 0.16666666666666666, ‘quick’: 0.16666666666666666}
# TEST Implement a TF function (2a)
# Checks tf() on the sample sentence and on a string with a repeated token.
# String literals use ASCII quotes so the cell is valid Python.
tf_test = tf(tokenize(quickbrownfox))
Test.assertEquals(tf_test, {'brown': 0.16666666666666666, 'lazy': 0.16666666666666666,
                           'jumps': 0.16666666666666666, 'fox': 0.16666666666666666,
                           'dog': 0.16666666666666666, 'quick': 0.16666666666666666},
                  'incorrect result for tf on sample text')
tf_test2 = tf(tokenize('one_ one_ two!'))
Test.assertEquals(tf_test2, {'one_': 0.6666666666666666, 'two': 0.3333333333333333},
                  'incorrect result for tf test')
1 test passed.
1 test passed.
corpusRDD
, consisting of a combination of the two small datasets, amazonRecToToken
and googleRecToToken
. Each element of the corpusRDD
should be a pair consisting of a key from one of the small datasets (ID or URL) and the value is the associated value for that key from the small datasets.# TODO: Replace with appropriate code
# The corpus is the union of both tokenized small datasets, so every
# (key, tokens) pair of either dataset appears in it.
corpusRDD = amazonRecToToken.union(googleRecToToken)
# TEST Create a corpus (2b)
Test.assertEquals(corpusRDD.count(), 400, 'incorrect corpusRDD.count()')
#print corpusRDD.first()
1 test passed.
idfs
that assigns an IDF weight to every unique token in an RDD called corpus
. The function should return an pair RDD where the key
is the unique token and value is the IDF weight for the token.corpus
. For each document, you should only include a token once, even if it appears multiple times in that document.idfs
to compute the IDF weights for all tokens in corpusRDD
(the combined small datasets).# TODO: Replace with appropriate code
def idfs(corpus):
    """ Compute the IDF weight of every token in a corpus
    Args:
        corpus (RDD): RDD of (documentKey, tokens) pairs
    Returns:
        RDD: a RDD of (token, IDF value), where the IDF value is the number of
            documents divided by the number of distinct document keys containing the token
    """
    docCount = corpus.count()
    # One (token, documentKey) pair per token occurrence.
    tokenDocPairs = corpus.flatMap(lambda doc: [(token, doc[0]) for token in doc[1]])
    # set() collapses repeated occurrences of a token within the same document key.
    docsPerToken = (tokenDocPairs
                    .groupByKey()
                    .map(lambda entry: (entry[0], len(set(entry[1])))))
    return docsPerToken.map(lambda entry: (entry[0], docCount * 1.0 / entry[1]))
idfsSmall = idfs(amazonRecToToken.union(googleRecToToken))
uniqueTokenCount = idfsSmall.count()
print 'There are %s unique tokens in the small datasets.' % uniqueTokenCount
There are 4772 unique tokens in the small datasets.
# TEST Implement an IDFs function (2c)
Test.assertEquals(uniqueTokenCount, 4772, 'incorrect uniqueTokenCount')
tokenSmallestIdf = idfsSmall.takeOrdered(1, lambda s: s[1])[0]
Test.assertEquals(tokenSmallestIdf[0], 'software', 'incorrect smallest IDF token')
Test.assertTrue(abs(tokenSmallestIdf[1] - 4.25531914894) < 0.0000000001,
'incorrect smallest IDF value')
1 test passed.
1 test passed.
1 test passed.
smallIDFTokens = idfsSmall.takeOrdered(11, lambda s: s[1])
print smallIDFTokens
[('software', 4.25531914893617), ('new', 6.896551724137931), ('features', 6.896551724137931), ('use', 7.017543859649122), ('complete', 7.2727272727272725), ('easy', 7.6923076923076925), ('create', 8.333333333333334), ('system', 8.333333333333334), ('cd', 8.333333333333334), ('1', 8.51063829787234), ('windows', 8.51063829787234)]
matplotlib
import matplotlib.pyplot as plt
small_idf_values = idfsSmall.map(lambda s: s[1]).collect()
fig = plt.figure(figsize=(8,3))
plt.hist(small_idf_values, 50, log=True)
pass
tf
function to implement a tfidf(tokens, idfs)
function that takes a list of tokens from a document and a Python dictionary of IDF weights and returns a Python dictionary mapping individual tokens to total TF-IDF weights.tokens
tfidf
function to compute the weights of Amazon product record ‘b000hkgj8k’. To do this, we need to extract the record for the token from the tokenized small Amazon dataset and we need to convert the IDFs for the small dataset into a Python dictionary. We can do the first part, by using a filter()
transformation to extract the matching record and a collect()
action to return the value to the driver. For the second part, we use the collectAsMap()
action to return the IDFs to the driver as a Python dictionary.# TODO: Replace with appropriate code
def tfidf(tokens, idfs):
    """ Compute TF-IDF weights for one document
    Args:
        tokens (list of str): tokens of the document, as produced by tokenize
        idfs (dictionary): token to IDF value; must contain every token of the document
    Returns:
        dictionary: token to TF-IDF value (TF multiplied by IDF)
    """
    return {token: freq * idfs[token] for token, freq in tf(tokens).items()}
recb000hkgj8k = amazonRecToToken.filter(lambda x: x[0] == 'b000hkgj8k').collect()[0][1]
idfsSmallWeights = idfsSmall.collectAsMap() # mark
rec_b000hkgj8k_weights = tfidf(recb000hkgj8k, idfsSmallWeights)
print 'Amazon record "b000hkgj8k" has tokens and weights:\n%s' % rec_b000hkgj8k_weights
Amazon record "b000hkgj8k" has tokens and weights:
{'autocad': 33.33333333333333, 'autodesk': 8.333333333333332, 'courseware': 66.66666666666666, 'psg': 33.33333333333333, '2007': 3.5087719298245617, 'customizing': 16.666666666666664, 'interface': 3.0303030303030303}
# TEST Implement a TF-IDF function (2f)
Test.assertEquals(rec_b000hkgj8k_weights,
{'autocad': 33.33333333333333, 'autodesk': 8.333333333333332,
'courseware': 66.66666666666666, 'psg': 33.33333333333333,
'2007': 3.5087719298245617, 'customizing': 16.666666666666664,
'interface': 3.0303030303030303}, 'incorrect rec_b000hkgj8k_weights')
1 test passed.
cosineSimilarity
functioncosineSimilarity
function.tokenize
and tfidf
functions, and the IDF weights from Part 2 for extracting tokens and assigning them weights.dotprod
that takes two Python dictionaries and produces the dot product of them, where the dot product is defined as the sum of the product of values for tokens that appear in both dictionariesnorm
that returns the square root of the dot product of a dictionary and itselfcossim
that returns the dot product of two dictionaries divided by the norm of the first dictionary and then by the norm of the second dictionary# TODO: Replace with appropriate code
import math
def dotprod(a, b):
    """ Compute dot product
    Args:
        a (dictionary): first dictionary of record to value
        b (dictionary): second dictionary of record to value
    Returns:
        dotProd: sum of a[k] * b[k] over the keys present in both
            dictionaries (0 when they share no key)
    """
    return sum(a[k] * b[k] for k in a if k in b)
def norm(a):
    """ Compute square root of the dot product of a dictionary with itself
    Args:
        a (dictionary): a dictionary of record to value
    Returns:
        norm: the Euclidean length of the value vector (0.0 for an empty dictionary)
    """
    return math.sqrt(sum(value ** 2 for value in a.values()))
def cossim(a, b):
    """ Compute cosine similarity
    Args:
        a (dictionary): first dictionary of record to value
        b (dictionary): second dictionary of record to value
    Returns:
        cossim: dot product of two dictionaries divided by the norm of the first dictionary and
                then by the norm of the second dictionary; 0.0 when either norm is zero
    """
    denominator = norm(a) * norm(b)
    # An empty (or all-zero) vector has norm 0; report no similarity
    # instead of raising ZeroDivisionError.
    if denominator == 0:
        return 0.0
    return dotprod(a, b) / denominator
# Small fixed vectors used to check dotprod and norm.
testVec1 = {'foo': 2, 'bar': 3, 'baz': 5}
testVec2 = {'foo': 1, 'bar': 0, 'baz': 20}
dp = dotprod(testVec1, testVec2)
nm = norm(testVec1)
print('%s %s' % (dp, nm))
102 6.16441400297
# TEST Implement the components of a cosineSimilarity function (3a)
# Expected values for the fixed test vectors; literals use ASCII quotes so the cell parses.
Test.assertEquals(dp, 102, 'incorrect dp')
Test.assertTrue(abs(nm - 6.16441400297) < 0.0000001, 'incorrrect nm')
1 test passed.
1 test passed.
cosineSimilarity
functioncosineSimilarity(string1, string2, idfsDictionary)
function that takes two strings and a dictionary of IDF weights, and computes their cosine similarity in the context of some global IDF weights.tfidf
function to the tokenized first and second strings, using the dictionary of IDF weightscossim
function applied to the results of the two tfidf
functions# TODO: Replace with appropriate code
def cosineSimilarity(string1, string2, idfsDictionary):
    """ Compute cosine similarity between two strings
    Args:
        string1 (str): first string
        string2 (str): second string
        idfsDictionary (dictionary): a dictionary of IDF values
    Returns:
        cossim: cosine similarity of the TF-IDF weight vectors of the two tokenized strings
    """
    w1 = tfidf(tokenize(string1), idfsDictionary)
    w2 = tfidf(tokenize(string2), idfsDictionary)
    return cossim(w1, w2)
# Similarity of two short product names under the small-dataset IDF weights.
cossimAdobe = cosineSimilarity('Adobe Photoshop',
                               'Adobe Illustrator',
                               idfsSmallWeights)
print(cossimAdobe)
0.0577243382163
# TEST Implement a cosineSimilarity function (3b)
# Message literal uses ASCII quotes so the assertion parses.
Test.assertTrue(abs(cossimAdobe - 0.0577243382163) < 0.0000001, 'incorrect cossimAdobe')
1 test passed.
cosineSimilarity
function to compute its similarity to every record in the small Amazon dataset. Then, build a dictionary mapping (Google URL, Amazon ID)
tuples to similarity scores between 0 and 1.[ ((Google URL1, Google String1), (Amazon ID1, Amazon String1)), ((Google URL1, Google String1), (Amazon ID2, Amazon String2)), ((Google URL2, Google String2), (Amazon ID1, Amazon String1)), ... ]
b000o24l3q
and Google record http://www.google.com/base/feeds/snippets/17242822440574356561
.# TODO: Replace with appropriate code
# Every (google record, amazon record) combination of the two small datasets,
# cached because it is the input of more than one similarity computation.
crossSmall = (googleSmall
.cartesian(amazonSmall)
.cache())
def computeSimilarity(record):
    """ Compute the cosine similarity of one record combination
    Args:
        record: a pair, (google record, amazon record), each record being (key, text)
    Returns:
        tuple: (google URL, amazon ID, cosine similarity value)
    """
    googleRec, amazonRec = record[0], record[1]
    similarity = cosineSimilarity(googleRec[1], amazonRec[1], idfsSmallWeights)
    return (googleRec[0], amazonRec[0], similarity)
# Similarity of every record combination; cached so that repeated lookups
# do not recompute the whole cross product.
similarities = crossSmall.map(computeSimilarity).cache()
def similar(amazonID, googleURL):
    """ Look up the similarity of one Amazon/Google record pair
    Args:
        amazonID: amazon ID
        googleURL: google URL
    Returns:
        similar: cosine similarity value of the first matching entry in similarities
    """
    matches = similarities.filter(
        lambda record: record[0] == googleURL and record[1] == amazonID).collect()
    # Entries are (google URL, amazon ID, similarity); IndexError if nothing matched.
    firstMatch = matches[0]
    return firstMatch[2]
similarityAmazonGoogle = similar('b000o24l3q', 'http://www.google.com/base/feeds/snippets/17242822440574356561')
print 'Requested similarity is %s.' % similarityAmazonGoogle
Requested similarity is 0.000303171940451.
# TEST Perform Entity Resolution (3c)
Test.assertTrue(abs(similarityAmazonGoogle - 0.000303171940451) < 0.0000001,
'incorrect similarityAmazonGoogle')
1 test passed.
idfsSmallWeights
variable to all the workers. If we didn’t cache()
similarities, then it might have to be recreated if we run similar()
multiple times. This would cause Spark to send idfsSmallWeights
every time.computeSimilarityBroadcast
function that, given an element from the combination RDD, computes the cosine similarity for the two records in the element. This will be the same as the worker function computeSimilarity
in (3c) except that it uses a broadcast variable.b000o24l3q
and Google record http://www.google.com/base/feeds/snippets/17242822440574356561
.# TODO: Replace with appropriate code
def computeSimilarityBroadcast(record):
    """ Compute the cosine similarity of one record combination, using the broadcast IDF weights
    Args:
        record: a pair, (google record, amazon record), each record being (key, text)
    Returns:
        tuple: (google URL, amazon ID, cosine similarity value)
    """
    googleRec, amazonRec = record[0], record[1]
    # Read the IDF dictionary from the broadcast variable.
    idfWeights = idfsSmallBroadcast.value
    similarity = cosineSimilarity(googleRec[1], amazonRec[1], idfWeights)
    return (googleRec[0], amazonRec[0], similarity)
# Broadcast the small-dataset IDF dictionary, then score every record combination with it.
idfsSmallBroadcast = sc.broadcast(idfsSmallWeights)
similaritiesBroadcast = crossSmall.map(computeSimilarityBroadcast).cache()
def similarBroadcast(amazonID, googleURL):
    """ Look up the similarity of one Amazon/Google record pair in similaritiesBroadcast
    Args:
        amazonID: amazon ID
        googleURL: google URL
    Returns:
        similar: cosine similarity value of the first matching entry
    """
    matches = similaritiesBroadcast.filter(
        lambda record: record[0] == googleURL and record[1] == amazonID).collect()
    # Entries are (google URL, amazon ID, similarity); IndexError if nothing matched.
    firstMatch = matches[0]
    return firstMatch[2]
similarityAmazonGoogleBroadcast = similarBroadcast('b000o24l3q', 'http://www.google.com/base/feeds/snippets/17242822440574356561')
print 'Requested similarity is %s.' % similarityAmazonGoogleBroadcast
Requested similarity is 0.000303171940451.
# TEST Perform Entity Resolution with Broadcast Variables (3d)
from pyspark import Broadcast
Test.assertTrue(isinstance(idfsSmallBroadcast, Broadcast), 'incorrect idfsSmallBroadcast')
Test.assertEquals(len(idfsSmallBroadcast.value), 4772, 'incorrect idfsSmallBroadcast value')
Test.assertTrue(abs(similarityAmazonGoogleBroadcast - 0.000303171940451) < 0.0000001,
'incorrect similarityAmazonGoogle')
1 test passed.
1 test passed.
1 test passed.
# Two comma-separated fields; the greedy first group means the split
# happens at the last comma of the line.
GOLDFILE_PATTERN = '^(.+),(.+)'
# Parse each line of a data file using the specified regular expression pattern
def parse_goldfile_line(goldfile_line):
    """ Classify and parse one line of the 'gold standard' data file
    Args:
        goldfile_line: a line of data
    Returns:
        pair: (goldfile_line, -1) if the line does not match GOLDFILE_PATTERN,
            (goldfile_line, 0) for the header line (first field is '"idAmazon"'),
            otherwise (('field1 field2', 'gold'), 1) with quotes removed from both fields
    """
    parsed = re.search(GOLDFILE_PATTERN, goldfile_line)
    if parsed is None:
        print('Invalid goldfile line: %s' % goldfile_line)
        return (goldfile_line, -1)
    if parsed.group(1) == '"idAmazon"':
        print('Header datafile line: %s' % goldfile_line)
        return (goldfile_line, 0)
    firstId = removeQuotes(parsed.group(1))
    secondId = removeQuotes(parsed.group(2))
    return (('%s %s' % (firstId, secondId), 'gold'), 1)
goldfile = os.path.join(baseDir, inputPath, GOLD_STANDARD_PATH)
# Parse every line once and cache the result; both filters below read it.
gsRaw = (sc
         .textFile(goldfile)
         .map(parse_goldfile_line)
         .cache())
gsFailed = (gsRaw
            .filter(lambda s: s[1] == -1)
            .map(lambda s: s[0]))
for line in gsFailed.take(10):
    print('Invalid goldfile line: %s' % line)
goldStandard = (gsRaw
                .filter(lambda s: s[1] == 1)
                .map(lambda s: s[0])
                .cache())
# Each count() is a Spark action; run it once and reuse the number for
# both the report and the sanity checks.
gsRawCount = gsRaw.count()
goldStandardCount = goldStandard.count()
gsFailedCount = gsFailed.count()
print('Read %d lines, successfully parsed %d lines, failed to parse %d lines' % (gsRawCount,
                                                                                 goldStandardCount,
                                                                                 gsFailedCount))
assert (gsFailedCount == 0)
# The one line that is neither valid nor failed is the header.
assert (gsRawCount == (goldStandardCount + 1))
Read 1301 lines, successfully parsed 1300 lines, failed to parse 0 lines
sims
RDD from the similaritiesBroadcast
RDD, where each element consists of a pair of the form (“AmazonID GoogleURL”, cosineSimilarityScore). An example entry from sims
is: (‘b000bi7uqs http://www.google.com/base/feeds/snippets/18403148885652932189‘, 0.40202896125621296)sims
RDD with the goldStandard
RDD by creating a new trueDupsRDD
RDD that has the just the cosine similarity scores for those “AmazonID GoogleURL” pairs that appear in both the sims
RDD and goldStandard
RDD. Hint: you can do this using the join() transformation.trueDupsRDD
datasettrueDupsRDD
datasets. Remember to use float
for calculationnonDupsRDD
RDD that has the just the cosine similarity scores for those “AmazonID GoogleURL” pairs from the similaritiesBroadcast
RDD that do not appear in both the sims RDD and gold standard RDD.float
for calculation# TODO: Replace with appropriate code
# Key every similarity by 'AmazonID GoogleURL', the key format of goldStandard.
sims = similaritiesBroadcast.map(lambda rec: ('%s %s' % (rec[1], rec[0]), rec[2]))
# The inner join keeps only the pairs listed in the gold standard (true duplicates).
trueDupsRDD = sims.join(goldStandard)
trueDupsCount = trueDupsRDD.count()
avgSimDups = trueDupsRDD.map(lambda kv: kv[1][0]).mean()
# The left outer join yields None on the gold side for pairs that are not in
# the gold standard; filter on that directly rather than on a -1 sentinel.
nonDupsRDD = (sims
              .leftOuterJoin(goldStandard)
              .filter(lambda kv: kv[1][1] is None)
              .map(lambda kv: kv[1][0]))
avgSimNon = nonDupsRDD.mean()
print('There are %s true duplicates.' % trueDupsCount)
print('The average similarity of true duplicates is %s.' % avgSimDups)
print('And for non duplicates, it is %s.' % avgSimNon)
There are 146 true duplicates.
The average similarity of true duplicates is 0.264332573435.
And for non duplicates, it is 0.00123476304656.
# TEST Perform a Gold Standard evaluation (3e)
# Expected gold-standard evaluation results; literals use ASCII quotes so the cell parses.
Test.assertEquals(trueDupsCount, 146, 'incorrect trueDupsCount')
Test.assertTrue(abs(avgSimDups - 0.264332573435) < 0.0000001, 'incorrect avgSimDups')
Test.assertTrue(abs(avgSimNon - 0.00123476304656) < 0.0000001, 'incorrect avgSimNon')
1 test passed.
1 test passed.
1 test passed.
Note on terminology: In text search, a forward index maps documents in a dataset to the tokens they contain. An inverted index supports the inverse mapping.
Note: For this section, use the complete Google and Amazon datasets, not the samples
# TODO: Replace with appropriate code
# Tokenize the text of every record of the full datasets, keeping the record key.
amazonFullRecToToken = amazon.map(lambda rec: (rec[0], tokenize(rec[1])))
googleFullRecToToken = google.map(lambda rec: (rec[0], tokenize(rec[1])))
amazonFullCount = amazonFullRecToToken.count()
googleFullCount = googleFullRecToToken.count()
print('Amazon full dataset is %s products, Google full dataset is %s products' % (amazonFullCount,
                                                                                    googleFullCount))
Amazon full dataset is 1363 products, Google full dataset is 3226 products
# TEST Tokenize the full dataset (4a)
Test.assertEquals(amazonFullRecToToken.count(), 1363, 'incorrect amazonFullRecToToken.count()')
Test.assertEquals(googleFullRecToToken.count(), 3226, 'incorrect googleFullRecToToken.count()')
1 test passed.
1 test passed.
fullCorpusRDD
that contains the tokens from the full Amazon and Google datasets.idfs
function to the fullCorpusRDD
# TODO: Replace with appropriate code
# Corpus of the full datasets and the IDF weight of every token in it.
fullCorpusRDD = amazonFullRecToToken.union(googleFullRecToToken)
idfsFull = idfs(fullCorpusRDD)
idfsFullCount = idfsFull.count()
print('There are %s unique tokens in the full datasets.' % idfsFullCount)
# Recompute IDFs for full dataset
idfsFullWeights = idfsFull.collectAsMap()
idfsFullBroadcast = sc.broadcast(idfsFullWeights)
# Pre-compute TF-IDF weights. Build mappings from record ID weight vector.
amazonWeightsRDD = amazonFullRecToToken.map(lambda rec: (rec[0], tfidf(rec[1], idfsFullBroadcast.value)))
googleWeightsRDD = googleFullRecToToken.map(lambda rec: (rec[0], tfidf(rec[1], idfsFullBroadcast.value)))
print('There are %s Amazon weights and %s Google weights.' % (amazonWeightsRDD.count(),
                                                              googleWeightsRDD.count()))
There are 17078 unique tokens in the full datasets.
There are 1363 Amazon weights and 3226 Google weights.
# TEST Compute IDFs and TF-IDFs for the full datasets (4b)
# Expected sizes for the full datasets; literals use ASCII quotes so the cell parses.
Test.assertEquals(idfsFullCount, 17078, 'incorrect idfsFullCount')
Test.assertEquals(amazonWeightsRDD.count(), 1363, 'incorrect amazonWeightsRDD.count()')
Test.assertEquals(googleWeightsRDD.count(), 3226, 'incorrect googleWeightsRDD.count()')
1 test passed.
1 test passed.
1 test passed.
# TODO: Replace with appropriate code
# Norm of every record's TF-IDF weight vector, collected into a dictionary
# keyed by record ID and broadcast to the workers.
amazonNorms = amazonWeightsRDD.map(lambda rec: (rec[0], norm(rec[1])))
googleNorms = googleWeightsRDD.map(lambda rec: (rec[0], norm(rec[1])))
amazonNormsBroadcast = sc.broadcast(amazonNorms.collectAsMap())
googleNormsBroadcast = sc.broadcast(googleNorms.collectAsMap())
# TEST Compute Norms for the weights from the full datasets (4c)
# Both norm broadcasts must be Broadcast objects holding one norm per record;
# literals use ASCII quotes so the cell parses.
Test.assertTrue(isinstance(amazonNormsBroadcast, Broadcast), 'incorrect amazonNormsBroadcast')
Test.assertEquals(len(amazonNormsBroadcast.value), 1363, 'incorrect amazonNormsBroadcast.value')
Test.assertTrue(isinstance(googleNormsBroadcast, Broadcast), 'incorrect googleNormsBroadcast')
Test.assertEquals(len(googleNormsBroadcast.value), 3226, 'incorrect googleNormsBroadcast.value')
1 test passed.
1 test passed.
1 test passed.
1 test passed.
def invert(record):
    """ Invert (ID, tokens) to a list of (token, ID)
    Args:
        record: a pair, (ID, token vector); iterating the token vector must
            yield the tokens (e.g. a dictionary keyed by token)
    Returns:
        pairs: a list of pairs of token to ID
    """
    recordId, tokenVector = record
    return [(token, recordId) for token in tokenVector]
# Inverted indices: one (token, record ID) pair per token of every weight vector.
amazonInvPairsRDD = (amazonWeightsRDD
                     .flatMap(invert)
                     .cache())
googleInvPairsRDD = (googleWeightsRDD
                     .flatMap(invert)
                     .cache())
print('There are %s Amazon inverted pairs and %s Google inverted pairs.' % (amazonInvPairsRDD.count(),
                                                                            googleInvPairsRDD.count()))
There are 111387 Amazon inverted pairs and 77678 Google inverted pairs.
# TEST Create inverted indices from the full datasets (4d)
invertedPair = invert((1, {'foo': 2}))
Test.assertEquals(invertedPair[0][1], 1, 'incorrect invert result')
Test.assertEquals(amazonInvPairsRDD.count(), 111387, 'incorrect amazonInvPairsRDD.count()')
Test.assertEquals(googleInvPairsRDD.count(), 77678, 'incorrect googleInvPairsRDD.count()')
1 test passed.
1 test passed.
1 test passed.
def swap(record):
    """ Swap (token, (ID, URL)) to ((ID, URL), token)
    Args:
        record: a pair, (token, (ID, URL))
    Returns:
        pair: ((ID, URL), token)
    """
    token = record[0]
    keys = record[1]
    return (keys, token)
# Joining the two inverted indices on the token gives (token, (amazon ID, google URL));
# swapping and grouping then collects the shared tokens of every (ID, URL) pair.
commonTokens = (amazonInvPairsRDD
                .join(googleInvPairsRDD)
                .map(swap)
                .groupByKey()
                .cache())
print('Found %d common tokens' % commonTokens.count())
Found 2441100 common tokens
# TEST Identify common tokens from the full dataset (4e)
# Message literal uses ASCII quotes so the assertion parses.
Test.assertEquals(commonTokens.count(), 2441100, 'incorrect commonTokens.count()')
[((‘b00005lzly’, ‘http://www.google.com/base/feeds/snippets/13823221823254120257‘),
fastCosinesSimilarity
function that takes in a record consisting of the pair ((Amazon ID, Google URL), tokens list) and computes the sum for each of the tokens in the token list of the products of the Amazon weight for the token times the Google weight for the token. The sum should then be divided by the norm for the Google URL and then divided by the norm for the Amazon ID. The function should return this value in a pair with the key being the (Amazon ID, Google URL). Make sure you use broadcast variables you created for both the weights and normsfastCosinesSimilarity
function to the common tokens from the full dataset# TODO: Replace with appropriate code
# Collect the per-record TF-IDF weight dictionaries to the driver and broadcast
# them, so that fastCosineSimilarity can look weights up by record key.
amazonWeightsBroadcast = sc.broadcast(amazonWeightsRDD.collectAsMap())
googleWeightsBroadcast = sc.broadcast(googleWeightsRDD.collectAsMap())
def fastCosineSimilarity(record):
    """ Compute Cosine Similarity using Broadcast variables
    Args:
        record: ((ID, URL), tokens), where tokens is an iterable of the
            tokens to sum over
    Returns:
        pair: ((ID, URL), cosine similarity value)
    """
    amazonRec = record[0][0]
    googleRec = record[0][1]
    tokens = record[1]
    # Fetch each record's weight dictionary once instead of once per token.
    amazonWeights = amazonWeightsBroadcast.value[amazonRec]
    googleWeights = googleWeightsBroadcast.value[googleRec]
    # Dot product restricted to tokens that have a weight on both sides.
    dotProduct = sum(amazonWeights[t] * googleWeights[t]
                     for t in tokens
                     if t in amazonWeights and t in googleWeights)
    value = dotProduct / (amazonNormsBroadcast.value[amazonRec] *
                          googleNormsBroadcast.value[googleRec])
    key = (amazonRec, googleRec)
    return (key, value)
# Score every candidate pair and keep the result cached for the later parts.
similaritiesFullRDD = commonTokens.map(fastCosineSimilarity).cache()
print(similaritiesFullRDD.count())
2441100
# TEST Identify common tokens from the full dataset (4f)
# Records are ((Amazon ID, Google URL), cosine similarity); select one known pair.
similarityTest = similaritiesFullRDD.filter(
    lambda rec: rec[0][0] == 'b00005lzly' and
    rec[0][1] == 'http://www.google.com/base/feeds/snippets/13823221823254120257').collect()
Test.assertEquals(len(similarityTest), 1, 'incorrect len(similarityTest)')
Test.assertTrue(abs(similarityTest[0][1] - 4.286548414e-06) < 0.000000000001, 'incorrect similarityTest fastCosineSimilarity')
Test.assertEquals(similaritiesFullRDD.count(), 2441100, 'incorrect similaritiesFullRDD.count()')
1 test passed.
1 test passed.
1 test passed.
4.286548414e-06
Note: In this part, we use the “gold standard” mapping from the included file to look up true duplicates, and the results of Part 4.
Note: In this part, you will not be writing any code. We’ve written all of the code for you. Run each cell and then answer the quiz questions on Studio.
Create simsFullRDD from our similaritiesFullRDD that consists of a pair of ((Amazon ID, Google URL), similarity score).
Join the goldStandard RDD and simsFullRDD and extract the similarity scores using the helper function.
# Create an RDD of ((Amazon ID, Google URL), similarity score)
# Re-key each score by the single string "<Amazon ID> <Google URL>"
# (presumably the key format goldStandard uses for the join below -- confirm).
simsFullRDD = similaritiesFullRDD.map(lambda x: ('%s %s' % (x[0][0], x[0][1]), x[1]))
assert (simsFullRDD.count() == 2441100)
# Create an RDD of just the similarity scores
simsFullValuesRDD = (simsFullRDD
                     .map(lambda x: x[1])
                     .cache())
assert (simsFullValuesRDD.count() == 2441100)
# Look up all similarity scores for true duplicates
# This helper function will return the similarity score for records that are in the gold standard and the simsFullRDD (True positives), and will return 0 for records that are in the gold standard but not in simsFullRDD (False Negatives).
def gs_value(record):
    """ Extract the similarity score from a joined gold-standard record
    Args:
        record: a pair, (key, (gold standard value, similarity score)),
            where the similarity score is None when the key had no match
    Returns:
        the similarity score, or 0 when the score is None
    """
    score = record[1][1]
    # Test against None explicitly so a genuine score of 0.0 is returned as-is.
    return 0 if score is None else score
# Join the gold standard and simsFullRDD, and then extract the similarities scores using the helper function
# leftOuterJoin keeps every gold-standard key; keys with no entry in
# simsFullRDD carry None as their score, which gs_value maps to 0.
trueDupSimsRDD = (goldStandard
                  .leftOuterJoin(simsFullRDD)
                  .map(gs_value)
                  .cache())
print('There are %s true duplicates.' % trueDupSimsRDD.count())
assert(trueDupSimsRDD.count() == 1300)
There are 1300 true duplicates.
We define a custom accumulator type, VectorAccumulatorParam, along with functions to initialize the accumulator’s vector to zero, and to add two vectors. Note that we have to use the += operator because you can only add to an accumulator.
from pyspark.accumulators import AccumulatorParam
class VectorAccumulatorParam(AccumulatorParam):
    """ Accumulator parameter that adds list-valued accumulators element-wise """

    def zero(self, value):
        """ Initialize the VectorAccumulator to 0
        Args:
            value: the initial vector; only its length is used
        Returns:
            list: zeros, one per element of value
        """
        return [0] * len(value)

    def addInPlace(self, val1, val2):
        """ Add two VectorAccumulator variables
        Args:
            val1: vector that is updated in place
            val2: vector added into val1; must have at least len(val1) elements
        Returns:
            list: val1, after adding val2 element by element
        """
        for i in range(len(val1)):
            val1[i] += val2[i]
        return val1
def set_bit(x, value, length):
    """ Build a vector that is zero everywhere except at one position
    Args:
        x: index of the position to set
        value: value stored at index x
        length: number of elements in the returned vector
    Returns:
        list: `length` elements, all 0 except `value` at index x; all zeros
            when x does not equal any index in 0 .. length - 1
    """
    return [value if x == y else 0 for y in range(length)]
# Number of threshold steps; bin() maps a score s to int(s * nthresholds).
nthresholds = 100
# One bin per value bin() can return for scores in [0, 1] (assumed range --
# confirm): 0 .. nthresholds. Derived rather than hard-coded as 101 so that
# changing nthresholds cannot leave scores without a bin.
BINS = nthresholds + 1


def bin(similarity):
    """ Map a similarity score to the index of its bin
    NOTE: shadows the builtin bin(); the name is kept because other code
    calls it.
    Args:
        similarity: similarity score
    Returns:
        int: similarity * nthresholds, truncated to an integer
    """
    return int(similarity * nthresholds)
# Per-bin counters, all starting at zero, held in a vector accumulator.
zeros = BINS * [0]
fpCounts = sc.accumulator(zeros, accum_param=VectorAccumulatorParam())
def add_element(score):
    """ Add 1 to the fpCounts bin that the score falls in
    Args:
        score: similarity score
    """
    # `+=` rebinds the name, so the accumulator must be declared global.
    global fpCounts
    fpCounts += set_bit(bin(score), 1, BINS)


# Count every similarity score in its bin.
simsFullValuesRDD.foreach(add_element)
def sub_element(score):
    """ Subtract 1 from the fpCounts bin that the score falls in
    Args:
        score: similarity score
    """
    # `+=` rebinds the name, so the accumulator must be declared global.
    global fpCounts
    fpCounts += set_bit(bin(score), -1, BINS)


# Remove the true duplicates' scores from the per-bin counts.
trueDupSimsRDD.foreach(sub_element)
def falsepos(threshold):
    """ Sum the fpCounts bins whose lower edge is at or above the threshold
    Args:
        threshold: similarity threshold
    Returns:
        sum of fpCounts.value[b] over bins b with b / nthresholds >= threshold
    """
    fpList = fpCounts.value
    return sum(fpList[b] for b in range(BINS)
               if float(b) / nthresholds >= threshold)
def falseneg(threshold):
    """ Count the true duplicates whose similarity is below the threshold
    Args:
        threshold: similarity threshold
    Returns:
        int: number of elements of trueDupSimsRDD strictly less than threshold
    """
    return trueDupSimsRDD.filter(lambda x: x < threshold).count()
def truepos(threshold):
    """ Count the true duplicates that are not false negatives at the threshold
    Args:
        threshold: similarity threshold; must already be a key of falsenegDict
    Returns:
        int: trueDupSimsRDD.count() minus falsenegDict[threshold]
    """
    return trueDupSimsRDD.count() - falsenegDict[threshold]
# Precision = true-positives / (true-positives + false-positives)
# Recall = true-positives / (true-positives + false-negatives)
# F-measure = 2 x Recall x Precision / (Recall + Precision)
def precision(threshold):
    """ Precision = true-positives / (true-positives + false-positives)
    Args:
        threshold: similarity threshold; must be a key of trueposDict and
            falseposDict
    Returns:
        float: the precision, or 0.0 when there are no positives at all
    """
    tp = trueposDict[threshold]
    positives = tp + falseposDict[threshold]
    # No positives at this threshold: report 0.0 instead of dividing by zero.
    if positives == 0:
        return 0.0
    return float(tp) / positives
def recall(threshold):
    """ Recall = true-positives / (true-positives + false-negatives)
    Args:
        threshold: similarity threshold; must be a key of trueposDict and
            falsenegDict
    Returns:
        float: the recall, or 0.0 when there are no true duplicates at all
    """
    tp = trueposDict[threshold]
    actual = tp + falsenegDict[threshold]
    # No true duplicates: report 0.0 instead of dividing by zero.
    if actual == 0:
        return 0.0
    return float(tp) / actual
def fmeasure(threshold):
    """ F-measure = 2 x Recall x Precision / (Recall + Precision)
    Args:
        threshold: similarity threshold
    Returns:
        float: the F-measure, or 0.0 when recall and precision are both zero
    """
    r = recall(threshold)
    p = precision(threshold)
    # Both terms zero: report 0.0 instead of dividing by zero.
    if r + p == 0:
        return 0.0
    return 2 * r * p / (r + p)
You can change nthresholds (above in part (5a)) to change the threshold values to plot.
thresholds = [float(n) / nthresholds for n in range(0, nthresholds)]
# Tabulate the counts per threshold. falsenegDict must be filled before
# trueposDict because truepos() reads it.
falseposDict = {t: falsepos(t) for t in thresholds}
falsenegDict = {t: falseneg(t) for t in thresholds}
trueposDict = {t: truepos(t) for t in thresholds}

# Metric curves, in the same order as thresholds.
precisions = [precision(t) for t in thresholds]
recalls = [recall(t) for t in thresholds]
fmeasures = [fmeasure(t) for t in thresholds]

print('%s %s' % (precisions[0], fmeasures[0]))
assert abs(precisions[0] - 0.000532546802671) < 0.0000001
assert abs(fmeasures[0] - 0.00106452669505) < 0.0000001
# Draw the three metric curves against the threshold on one figure; the
# plotting order matches the legend labels.
fig = plt.figure()
for series in (precisions, recalls, fmeasures):
    plt.plot(thresholds, series)
plt.legend(['Precision', 'Recall', 'F-measure'])
pass
0.000532546802671 0.00106452669505