Tuesday, May 17, 2016
Learning basic PySpark concepts with S3
I'm learning Apache Spark with Python, using data stored in S3. I tried out the basics by following the example at https://www.codementor.io/spark/tutorial/spark-python-rdd-basics.
import boto
# connect to S3 with boto (replace with your own credentials)
AWS_ACCESS_KEY_ID = "myAccessKeyId"
AWS_SECRET_ACCESS_KEY = "mySecretAccessKey"
conn = boto.connect_s3(AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY)
bucket = conn.get_bucket('nuhaa')
# list the files in the bucket
for file_key in bucket.list():
    print file_key.name
key_name = "kddcup-data_10_percent.csv"
# load the file from S3 into an RDD
data_rdd = sc.textFile("s3n://nuhaa/%s" % key_name)
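Note that sc here is the SparkContext that the pyspark shell creates for you. If you run this as a standalone script instead, you'd have to create it yourself and hand the S3 credentials over to Hadoop's s3n filesystem; a minimal sketch of what that could look like (the app name and master URL are just placeholders):

from pyspark import SparkConf, SparkContext
# build a SparkContext when not running inside the pyspark shell
conf = SparkConf().setAppName("s3-rdd-basics").setMaster("local[*]")
sc = SparkContext(conf=conf)
# pass the S3 credentials on to the underlying Hadoop s3n filesystem
sc._jsc.hadoopConfiguration().set("fs.s3n.awsAccessKeyId", AWS_ACCESS_KEY_ID)
sc._jsc.hadoopConfiguration().set("fs.s3n.awsSecretAccessKey", AWS_SECRET_ACCESS_KEY)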
from time import time
# look up the S3 key so we can report the file size
key = bucket.get_key(key_name)
t0 = time()
data_count = data_rdd.count()
tt = time() - t0
print "total number of records: %d" % data_count
print "size: %d bytes" % key.size
print "count completed in {} seconds".format(round(tt,3))
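Since the same RDD is hit by several actions below, it can also be worth caching it in memory so Spark doesn't re-read the file from S3 on every count; a small optional addition on my part:

# keep the RDD in memory across the repeated actions that follow
data_rdd.cache()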
# using filter to keep only the 'normal.' interactions
normal_data_rdd = data_rdd.filter(lambda x: 'normal.' in x)
t0 = time()
normal_count = normal_data_rdd.count()
tt = time() - t0
print "there are {} 'normal' interactions".format(normal_count)
print "count completed in {} seconds".format(round(tt,3))
# using map to split each line into its comma-separated fields
csv_data = data_rdd.map(lambda x: x.split(","))
t0 = time()
head_rows = csv_data.take(5)
tt = time() - t0
print "parse completed in {} seconds".format(round(tt,3))
# using collect to pull the entire dataset back to the driver
t0 = time()
all_data = data_rdd.collect()
tt = time() - t0
print "data collected in {} seconds".format(round(tt,3))