You are viewing a plain text version of this content. The canonical link for it is here.
Posted to user@spark.apache.org by Andrej Burja <an...@gmail.com> on 2015/07/18 10:56:42 UTC
PicklingError: Could not pickle object as excessively deep recursion required.

hi

On Windows, in local mode, using PySpark, I got an error about "excessively
deep recursion".
I'm using a module for lemmatizing/stemming, which uses a DLL and
some binary files (the module is a Python wrapper around C code).
Spark version: 1.4.0.
Any idea what is going on?

---------------------------------------------------------------------------
PicklingError                             Traceback (most recent call last)
<ipython-input-10-f699414a7f1a> in <module>()
      1 df1 = df.map(lambda p: lemmatizer.lemmatize('working'))
----> 2 df1.take(1)

C:\spark/python\pyspark\rdd.pyc in take(self, num)
   1263
   1264             p = range(partsScanned, min(partsScanned +
numPartsToTry, totalParts))
-> 1265             res = self.context.runJob(self, takeUpToNumLeft, p,
True)
   1266
   1267             items += res

C:\spark/python\pyspark\context.pyc in runJob(self, rdd, partitionFunc,
partitions, allowLocal)
    878         # SparkContext#runJob.
    879         mappedRDD = rdd.mapPartitions(partitionFunc)
--> 880         port = self._jvm.PythonRDD.runJob(self._jsc.sc(),
mappedRDD._jrdd, partitions,
    881                                           allowLocal)
    882         return list(_load_from_socket(port,
mappedRDD._jrdd_deserializer))

C:\spark/python\pyspark\rdd.pyc in _jrdd(self)
   2349         command = (self.func, profiler,
self._prev_jrdd_deserializer,
   2350                    self._jrdd_deserializer)
-> 2351         pickled_cmd, bvars, env, includes =
_prepare_for_python_RDD(self.ctx, command, self)
   2352         python_rdd = self.ctx._jvm.PythonRDD(self._prev_jrdd.rdd(),
   2353                                              bytearray(pickled_cmd),

C:\spark/python\pyspark\rdd.pyc in _prepare_for_python_RDD(sc, command, obj)
   2269     # the serialized command will be compressed by broadcast
   2270     ser = CloudPickleSerializer()
-> 2271     pickled_command = ser.dumps(command)
   2272     if len(pickled_command) > (1 << 20):  # 1M
   2273         # The broadcast will have same life cycle as created
PythonRDD

C:\spark/python\pyspark\serializers.pyc in dumps(self, obj)
    425
    426     def dumps(self, obj):
--> 427         return cloudpickle.dumps(obj, 2)
    428
    429

C:\spark/python\pyspark\cloudpickle.pyc in dumps(obj, protocol)
    620
    621     cp = CloudPickler(file,protocol)
--> 622     cp.dump(obj)
    623
    624     return file.getvalue()

C:\spark/python\pyspark\cloudpickle.pyc in dump(self, obj)
    109             if 'recursion' in e.args[0]:
    110                 msg = """Could not pickle object as excessively
deep recursion required."""
--> 111                 raise pickle.PicklingError(msg)
    112
    113     def save_memoryview(self, obj):

PicklingError: Could not pickle object as excessively deep recursion
required.