I am trying to broadcast a user defined variable in a PySpark application but I always have the following error:
File "/usr/local/spark-2.1.0-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/worker.py", line 174, in main
process()
File "/usr/local/spark-2.1.0-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/worker.py", line 169, in process
serializer.dump_stream(func(split_index, iterator), outfile)
File "/usr/local/spark-2.1.0-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/serializers.py", line 268, in dump_stream
vs = list(itertools.islice(iterator, batch))
File "/home/.../sparkbroad.py", line 29, in <lambda>
output = input_.map(lambda item: b.value.map(item))
File "/usr/local/spark-2.1.0-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/broadcast.py", line 106, in value
self._value = self.load(self._path)
File "/usr/local/spark-2.1.0-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/broadcast.py", line 97, in load
return pickle.load(f)
AttributeError: 'module' object has no attribute 'FooMap'
The code, in the module sparkbrad.py
is the following:
import random
import pyspark as spark
class FooMap(object):
def __init__(self):
keys = list(range(10))
values = [2 * key for key in keys]
self._map = dict(zip(keys, values))
def map(self, value):
if value not in self._map:
return -1
return self._map[value]
class FooMapJob(object):
def __init__(self, inputs):
self._inputs = inputs
self._foomap = FooMap()
def run(self):
sc = spark.SparkContext('local', 'FooMap')
input_ = sc.parallelize(self._inputs, 4)
b = sc.broadcast(self._foomap)
output = input_.map(lambda item: b.value.map(item))
b.unpersist()
result = list(output.toLocalIterator())
sc.stop()
return result
def main():
inputs = [random.randint(0, 10) for _ in range(10)]
job = FooMapJob(inputs)
print(job.run())
if __name__ == '__main__':
main()
and I am running it via the:
:~$ spark-submit --master local[4] --py-files sparkbroad.py sparkbroad.py
where I have added the --py-files
argument but it looks it doesn't change that much. Unfortunately, I could not find any example online dealing with broadcasting of complex classes (just lists or dictionaries). Any hint is appreciated. Thanks in advance.
UPDATE: placing the FooMap
class in a separate module, everything seems working fine, even without the --py-files
directive.
Placing the FooMap
class in a separate module, everything works fine.
This is an addition to the previous answer.
You should import FooMap
from another file not just define it in current file
maybe like this: in foo_map.py:
class FooMap(object):
def __init__(self):
keys = list(range(10))
values = [2 * key for key in keys]
self._map = dict(zip(keys, values))
def map(self, value):
if value not in self._map:
return -1
return self._map[value]
then in sparkbrad.py
from foo_map import FooMap
class FooMapJob(object):
def __init__(self, inputs):
self._inputs = inputs
self._foomap = FooMap()
def run(self):
sc = spark.SparkContext('local', 'FooMap')
input_ = sc.parallelize(self._inputs, 4)
b = sc.broadcast(self._foomap)
output = input_.map(lambda item: b.value.map(item))
b.unpersist()
result = list(output.toLocalIterator())
sc.stop()
return result
def main():
inputs = [random.randint(0, 10) for _ in range(10)]
job = FooMapJob(inputs)
print(job.run())
if __name__ == '__main__':
main()
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With