Tuesday, May 17, 2016

Learning basic PySpark concepts with S3


I'm learning Apache Spark with Python and S3, working through the RDD basics example at https://www.codementor.io/spark/tutorial/spark-python-rdd-basics.

import boto

AWS_ACCESS_KEY_ID="myAccessKeyId"
AWS_SECRET_ACCESS_KEY="mySecretAccessKey"

conn = boto.connect_s3(AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY)
bucket = conn.get_bucket('nuhaa')

# list the files in the bucket
for file_key in bucket.list():
    print(file_key.name)
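
The snippets below assume a SparkContext called sc, which the pyspark shell creates for you. If you run this as a standalone script, you have to create it yourself, and the s3n filesystem behind sc.textFile needs the AWS credentials too. A minimal sketch, assuming a local master (fs.s3n.awsAccessKeyId and fs.s3n.awsSecretAccessKey are the standard Hadoop s3n settings):

from pyspark import SparkConf, SparkContext

# create a SparkContext (skip this inside the pyspark shell)
conf = SparkConf().setAppName("s3-rdd-basics").setMaster("local[*]")
sc = SparkContext(conf=conf)

# hand the AWS credentials to Hadoop's s3n filesystem
hadoop_conf = sc._jsc.hadoopConfiguration()
hadoop_conf.set("fs.s3n.awsAccessKeyId", AWS_ACCESS_KEY_ID)
hadoop_conf.set("fs.s3n.awsSecretAccessKey", AWS_SECRET_ACCESS_KEY)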

key_name = "kddcup-data_10_percent.csv"
key = bucket.get_key(key_name)   # used below to report the file size

# load the data from S3 into an RDD
data_rdd = sc.textFile("s3n://nuhaa/%s" % key_name)
from time import time
t0 = time()
data_count = data_rdd.count()
tt = time() - t0
print "total number of records: %d" % data_count
print "size %s" % hbytes(key.size)
print "count completed in {} seconds".format(round(tt,3))
key.close()
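
Every action above reads the file from S3 again. Calling cache() keeps the RDD in memory after the first action computes it, so repeated counts get much faster; a small sketch:

# keep the RDD in memory once it has been computed
data_rdd.cache()

t0 = time()
data_rdd.count()   # first count reads from S3 and fills the cache
print "first count: {} seconds".format(round(time() - t0, 3))

t0 = time()
data_rdd.count()   # second count is served from memory
print "cached count: {} seconds".format(round(time() - t0, 3))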

# using filter
normal_data_rdd = data_rdd.filter(lambda x: 'normal.' in x)
t0 = time()
normal_count = normal_data_rdd.count()
tt = time() - t0

print "there are {} 'normal' interactions".format(normal_count)
print "count completed in {} seconds".format(round(tt,3))

# using map
csv_data = data_rdd.map(lambda x: x.split(","))
t0 = time()
head_rows = csv_data.take(5)
tt = time() - t0

print "parse completed in {} seconds".format(round(tt,3))

# using collect
t0 = time()
all_data = data_rdd.collect()
tt = time() - t0

print "data collected in {} seconds".format(round(tt,3))


