参考:https://github.com/nalepae/pandarallel/issues/127
在使用 pandarallel 时报错 OSError: [Errno 28] No space left on device。根据上述 issue,发现 pandarallel 默认使用的 MEMORY_FS_ROOT 确实是 /dev/shm,而在 docker 环境下这个目录的大小默认只有 64M,完全不够处理数据。
因此,需要添加如下代码:
os.environ['MEMORY_FS_ROOT'] = "/data"
os.environ['JOBLIB_TEMP_FOLDER'] = "/data"
一开始已经加了上述参数,但 pandarallel/core.py 中的 MEMORY_FS_ROOT 依旧没有被覆盖。后来通过打印 os.environ 发现传入的参数没有覆盖成功(推测是因为 pandarallel 在被 import 时就读取了该环境变量,之后再设置已经不起作用),需要将下面的代码
import os
os.environ['MEMORY_FS_ROOT'] = "/data"
os.environ['JOBLIB_TEMP_FOLDER'] = "/data"
放在执行文件的最上面(必须在 import pandarallel 之前),即可成功覆盖参数。
然后收到如下报错:
File "/root/miniconda3/envs/kb_search/lib/python3.10/site-packages/pandarallel/core.py", line 422, in closure
pool = CONTEXT.Pool(nb_workers)
File "/root/miniconda3/envs/kb_search/lib/python3.10/multiprocessing/context.py", line 119, in Pool
return Pool(processes, initializer, initargs, maxtasksperchild,
File "/root/miniconda3/envs/kb_search/lib/python3.10/multiprocessing/pool.py", line 191, in __init__
self._setup_queues()
File "/root/miniconda3/envs/kb_search/lib/python3.10/multiprocessing/pool.py", line 346, in _setup_queues
self._inqueue = self._ctx.SimpleQueue()
File "/root/miniconda3/envs/kb_search/lib/python3.10/multiprocessing/context.py", line 113, in SimpleQueue
return SimpleQueue(ctx=self.get_context())
File "/root/miniconda3/envs/kb_search/lib/python3.10/multiprocessing/queues.py", line 341, in __init__
self._rlock = ctx.Lock()
File "/root/miniconda3/envs/kb_search/lib/python3.10/multiprocessing/context.py", line 68, in Lock
return Lock(ctx=self.get_context())
File "/root/miniconda3/envs/kb_search/lib/python3.10/multiprocessing/synchronize.py", line 163, in __init__
SemLock.__init__(self, SEMAPHORE, 1, 1, ctx=ctx)
File "/root/miniconda3/envs/kb_search/lib/python3.10/multiprocessing/synchronize.py", line 58, in __init__
sl = self._semlock = _multiprocessing.SemLock(
OSError: [Errno 28] No space left on device
这时在调用 parallel_apply 处仍然报错“磁盘没有空间”,但是指定的磁盘(/data)还有 1.2T 的可用空间。
查看报错位置的代码:
https://fossies.org/dox/openslides-2.3-portable/classmultiprocessing_1_1synchronize_1_1SemLock.html
定义:
class SemLock(object):
_rand = tempfile._RandomNameSequence()
def __init__(self, kind, value, maxvalue, *, ctx):
if ctx is None:
ctx = context._default_context.get_context()
name = ctx.get_start_method()
unlink_now = sys.platform == 'win32' or name == 'fork'
for i in range(100):
try:
sl = self._semlock = _multiprocessing.SemLock(
kind, value, maxvalue, self._make_name(),
unlink_now)
except FileExistsError:
pass
else:
break
else:
raise FileExistsError('cannot find name for semaphore')
util.debug('created semlock with handle %s' % sl.handle)
self._make_methods()
if sys.platform != 'win32':
def _after_fork(obj):
obj._semlock._after_fork()
util.register_after_fork(self, _after_fork)
if self._semlock.name is not None:
# We only get here if we are on Unix with forking
# disabled. When the object is garbage collected or the
# process shuts down we unlink the semaphore name
from .semaphore_tracker import register
register(self._semlock.name)
util.Finalize(self, SemLock._cleanup, (self._semlock.name,),
exitpriority=0)
@staticmethod
def _cleanup(name):
from .semaphore_tracker import unregister
sem_unlink(name)
unregister(name)
def _make_methods(self):
self.acquire = self._semlock.acquire
self.release = self._semlock.release
def __enter__(self):
return self._semlock.__enter__()
def __exit__(self, *args):
return self._semlock.__exit__(*args)
def __getstate__(self):
context.assert_spawning(self)
sl = self._semlock
if sys.platform == 'win32':
h = context.get_spawning_popen().duplicate_for_child(sl.handle)
else:
h = sl.handle
return (h, sl.kind, sl.maxvalue, sl.name)
def __setstate__(self, state):
self._semlock = _multiprocessing.SemLock._rebuild(*state)
util.debug('recreated blocker with handle %r' % state[0])
self._make_methods()
@staticmethod
def _make_name():
return '%s-%s' % (process.current_process()._config['semprefix'],
next(SemLock._rand))
报错语句:
sl = self._semlock = _multiprocessing.SemLock(
kind, value, maxvalue, self._make_name(),
unlink_now)
_multiprocessing.SemLock(kind, value, maxvalue, self._make_name(),unlink_now)
实际参数为 _multiprocessing.SemLock(1, 1, 1, '/mp-ef2p1_pn', True)
使用test.py
import os
os.environ['MEMORY_FS_ROOT'] = "/data"
os.environ['JOBLIB_TEMP_FOLDER'] = "/data"
import _multiprocessing
_multiprocessing.SemLock(1 ,1 ,1 ,'/mp-ef2p1_pn', True)
# _multiprocessing.SemLock(1 ,1 ,1 ,'/mp-ef2p1_pn', False)
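依旧收到同样的报错(OSError: [Errno 28] No space left on device)。这说明该错误与 MEMORY_FS_ROOT、JOBLIB_TEMP_FOLDER 无关:在 Linux 上,POSIX 命名信号量(sem_open)是以 sem.<name> 文件的形式创建在 /dev/shm 下的,只要 /dev/shm 被写满,创建信号量就会失败。可以尝试的方向是清理 /dev/shm 中的残留文件,或在启动容器时用 docker run --shm-size 调大 /dev/shm(此处未验证)。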
https://github.com/nalepae/pandarallel/issues/143
解决不了,放弃