join连接
import sys

from pyspark import SparkContext

if __name__ == "__main__":
    # Spark master URL: local mode by default, overridable by passing
    # exactly one command-line argument.
    master = sys.argv[1] if len(sys.argv) == 2 else "local"

    # Stop a SparkContext left over from a previous interactive run.
    # When this file runs as a plain script, `sc` does not exist yet,
    # which is the only failure that is expected and safe to ignore.
    try:
        sc.stop()
    except NameError:
        pass

    sc = SparkContext(master, 'test')
    try:
        # Pair RDD of (store name, address); "Philz" has two addresses and
        # "Starbucks" has no rating in the other RDD.
        storeAddress = sc.parallelize((
            ("Ritual", "1026 Valencia St"),
            ("Philz", "748 Van Ness Ave"),
            ("Philz", "3101 24th St"),
            ("Starbucks", "Seattle"),
        ))
        # Pair RDD of (store name, rating).
        storeRating = sc.parallelize((
            ("Ritual", 4.9),
            ("Philz", 4.8),
        ))

        # Inner join: keeps only keys present in both RDDs, producing
        # (key, (address, rating)) for every matching pair.
        result = storeAddress.join(storeRating)
        print(result.collect())
    finally:
        # Release the context even if the job fails.
        sc.stop()
输出:(这是内连接)
[('Ritual', ('1026 Valencia St', 4.9)),
('Philz', ('748 Van Ness Ave', 4.8)),
('Philz', ('3101 24th St', 4.8))]
leftOuterJoin左连接
import sys

from pyspark import SparkContext

if __name__ == "__main__":
    # Spark master URL: local mode by default, overridable by passing
    # exactly one command-line argument.
    master = sys.argv[1] if len(sys.argv) == 2 else "local"

    # Stop a SparkContext left over from a previous interactive run.
    # When this file runs as a plain script, `sc` does not exist yet,
    # which is the only failure that is expected and safe to ignore.
    try:
        sc.stop()
    except NameError:
        pass

    sc = SparkContext(master, 'test')
    try:
        # Pair RDD of (store name, address); "Philz" has two addresses and
        # "Starbucks" has no rating in the other RDD.
        storeAddress = sc.parallelize((
            ("Ritual", "1026 Valencia St"),
            ("Philz", "748 Van Ness Ave"),
            ("Philz", "3101 24th St"),
            ("Starbucks", "Seattle"),
        ))
        # Pair RDD of (store name, rating).
        storeRating = sc.parallelize((
            ("Ritual", 4.9),
            ("Philz", 4.8),
        ))

        # Left outer join: keeps every key of storeAddress; keys with no
        # match in storeRating get None as the right-hand value.
        result = storeAddress.leftOuterJoin(storeRating)
        print(result.collect())
    finally:
        # Release the context even if the job fails.
        sc.stop()
输出:(相当于左外连接:保留左侧storeAddress的所有键,右侧storeRating中没有匹配时对应的值为None)
[('Ritual', ('1026 Valencia St', 4.9)),
('Philz', ('748 Van Ness Ave', 4.8)),
('Philz', ('3101 24th St', 4.8)),
('Starbucks', ('Seattle', None))]
rightOuterJoin右连接
import sys

from pyspark import SparkContext

if __name__ == "__main__":
    # Spark master URL: local mode by default, overridable by passing
    # exactly one command-line argument.
    master = sys.argv[1] if len(sys.argv) == 2 else "local"

    # Stop a SparkContext left over from a previous interactive run.
    # When this file runs as a plain script, `sc` does not exist yet,
    # which is the only failure that is expected and safe to ignore.
    try:
        sc.stop()
    except NameError:
        pass

    sc = SparkContext(master, 'test')
    try:
        # Pair RDD of (store name, address); "Philz" has two addresses and
        # "Starbucks" has no rating in the other RDD.
        storeAddress = sc.parallelize((
            ("Ritual", "1026 Valencia St"),
            ("Philz", "748 Van Ness Ave"),
            ("Philz", "3101 24th St"),
            ("Starbucks", "Seattle"),
        ))
        # Pair RDD of (store name, rating).
        storeRating = sc.parallelize((
            ("Ritual", 4.9),
            ("Philz", 4.8),
        ))

        # Right outer join: keeps every key of storeRating; keys with no
        # match in storeAddress would get None as the left-hand value.
        result = storeAddress.rightOuterJoin(storeRating)
        print(result.collect())
    finally:
        # Release the context even if the job fails.
        sc.stop()
输出:(相当于右外连接:保留右侧storeRating的所有键;本例中这些键在storeAddress中都有匹配,因此结果与内连接相同)
[('Ritual', ('1026 Valencia St', 4.9)),
('Philz', ('748 Van Ness Ave', 4.8)),
('Philz', ('3101 24th St', 4.8))]