When training XGBoost from PySpark on a CDH cluster, I kept running into the same error: "TypeError: 'JavaPackage' object is not callable". My first guess was a wrong jar version or jar path, so I tried several versions of xgboost4j (0.72, 0.81, 0.9, 1.0.0, 1.1.2) and two ways of handing the jars to Spark:

os.environ['PYSPARK_SUBMIT_ARGS'] = '--jars /home/data/xgboost/xgboost4j-spark_0.90.jar,/home/data/xgboost/xgboost4j_0.90.jar pyspark-shell'

and
spark = SparkSession\
    .builder\
    .master('yarn') \
    .appName("PySpark XGBOOST")\
    .config('spark.jars',
            'hdfs://xgb/xgb_1.0.0/xgboost4j-spark_2.11-1.0.0.jar,hdfs://xgb/xgb_1.0.0/xgboost4j_2.11-1.0.0.jar') \
    .getOrCreate()
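Before digging into the source, a quick way to tell whether either of the two configurations above actually put the jars on the driver's classpath is to look at what the JVM view resolves the class name to. This is a diagnostic sketch, assuming the SparkSession above has been created:

from py4j.java_gateway import JavaClass, JavaPackage

# If the driver JVM can see the jar, py4j resolves the name to a JavaClass;
# otherwise it silently falls back to a JavaPackage, and calling that package
# later is exactly what raises "TypeError: 'JavaPackage' object is not callable".
cls = spark.sparkContext._jvm.ml.dmlc.xgboost4j.scala.spark.XGBoostClassifier
print(type(cls))                     # JavaClass  -> class found
print(isinstance(cls, JavaPackage))  # True       -> class not found on the driver classpath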
Neither approach made the error go away, so the next step was to read the PySpark source and find where the Java object is actually constructed. The failing call lives in JavaWrapper._new_java_obj:

# /opt/cloudera/parcels/CDH-6.3.2-1.cdh6.3.2.p0.1605554/lib/spark/python/pyspark/ml/wrapper.py
from pyspark.ml.util import _jvm

@staticmethod
def _new_java_obj(java_class, *args):
    """
    Returns a new Java object.
    """
    sc = SparkContext._active_spark_context
    java_obj = _jvm()
    for name in java_class.split("."):
        java_obj = getattr(java_obj, name)
    java_args = [_py2java(sc, arg) for arg in args]
    return java_obj(*java_args)
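For context, this static method is what every JVM-backed estimator goes through. Below is a hedged sketch of how a wrapper class typically calls it; MyXGBClassifier is a made-up name and real third-party wrappers differ in detail, but the failing call is the same _new_java_obj with the Scala class FQN:

from pyspark.ml.wrapper import JavaEstimator

class MyXGBClassifier(JavaEstimator):
    """Hypothetical minimal wrapper, for illustration only."""
    def __init__(self):
        super(MyXGBClassifier, self).__init__()
        # If xgboost4j-spark is not on the driver classpath, the FQN below never
        # resolves to a JavaClass, _new_java_obj ends up calling a JavaPackage,
        # and that is where "TypeError: 'JavaPackage' object is not callable" appears.
        self._java_obj = self._new_java_obj(
            "ml.dmlc.xgboost4j.scala.spark.XGBoostClassifier", self.uid)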
So what does _jvm() actually return? Here is a test copy of _new_java_obj with some prints added:
from pyspark import SparkContext
from pyspark.ml.util import _jvm
from pyspark.ml.common import _py2java

def _new_java_obj(java_class, *args):
    """
    Returns a new Java object.
    """
    sc = SparkContext._active_spark_context
    java_obj = _jvm()
    print('java_obj', ':', java_obj)
    print('***************************')
    java_class_list = java_class.split(".")
    for name in java_class_list:
        i_index = java_class_list.index(name) + 1
        com = '.'.join(java_class_list[0:i_index])
        java_obj = getattr(java_obj, name)
        # rebuild by hand the reflection command that py4j sends for this prefix
        command = f"r\nu\n{com}\nrj\ne\n"
        print('command', ':', com)
        print('client answer', ':', java_obj._gateway_client.send_command(command))
        print('java_obj', ':', java_obj)
        print('-----------------------------')
    java_args = [_py2java(sc, arg) for arg in args]
    print(java_args)
    return java_obj(*java_args)

# _new_java_obj("com.microsoft.ml.spark.lightgbm.LightGBMRegressor")
# _new_java_obj('ml.dmlc.xgboost4j.scala.spark.XGBoostClassifier')
_new_java_obj("org.apache.spark.ml.regression.GBTRegressor")
In the output: the first print shows that java_obj is a py4j.java_gateway.JVMView object; prints two through six show a py4j.java_gateway.JavaPackage; the seventh print shows a py4j.java_gateway.JavaClass, which can then be called to construct the Java object. If the FQN cannot be resolved in the JVM, however, every step answers with a package, so the final java_obj is still a JavaPackage; JavaPackage defines no __call__, and calling it is what produces "TypeError: 'JavaPackage' object is not callable". That is exactly what happens with

_new_java_obj('ml.dmlc.xgboost4j.scala.spark.XGBoostClassifier')
# /opt/cloudera/parcels/CDH-6.3.2-1.cdh6.3.2.p0.1605554/lib/spark/python/pyspark/ml/util.py
from pyspark import SparkContext

def _jvm():
    """
    Returns the JVM view associated with SparkContext. Must be called
    after SparkContext is initialized.
    """
    jvm = SparkContext._jvm
    if jvm:
        return jvm
    else:
        raise AttributeError("Cannot load _jvm from SparkContext. Is SparkContext initialized?")
# /opt/cloudera/parcels/CDH-6.3.2-1.cdh6.3.2.p0.1605554/lib/spark/python/pyspark/__init__.py
from pyspark.context import SparkContext
# /opt/cloudera/parcels/CDH-6.3.2-1.cdh6.3.2.p0.1605554/lib/spark/python/pyspark/context.py
# lines 298-299 (inside SparkContext._ensure_initialized)
from pyspark.java_gateway import launch_gateway

SparkContext._gateway = gateway or launch_gateway(conf)
SparkContext._jvm = SparkContext._gateway.jvm
# /opt/cloudera/parcels/CDH-6.3.2-1.cdh6.3.2.p0.1605554/lib/spark/python/pyspark/java_gateway.py
import atexit
import os
import platform
import shlex
import shutil
import signal
import tempfile
import time
from subprocess import Popen, PIPE

from py4j.java_gateway import java_import, JavaGateway, JavaObject, GatewayParameters
from pyspark.find_spark_home import _find_spark_home
from pyspark.serializers import read_int, UTF8Deserializer

def launch_gateway(conf=None):
    """
    launch jvm gateway
    :param conf: spark configuration passed to spark-submit
    :return:
    """
    if "PYSPARK_GATEWAY_PORT" in os.environ:
        gateway_port = int(os.environ["PYSPARK_GATEWAY_PORT"])
        gateway_secret = os.environ["PYSPARK_GATEWAY_SECRET"]
    else:
        SPARK_HOME = _find_spark_home()
        # Launch the Py4j gateway using Spark's run command so that we pick up the
        # proper classpath and settings from spark-env.sh
        on_windows = platform.system() == "Windows"
        script = "./bin/spark-submit.cmd" if on_windows else "./bin/spark-submit"
        command = [os.path.join(SPARK_HOME, script)]
        if conf:
            for k, v in conf.getAll():
                command += ['--conf', '%s=%s' % (k, v)]
        submit_args = os.environ.get("PYSPARK_SUBMIT_ARGS", "pyspark-shell")
        if os.environ.get("SPARK_TESTING"):
            submit_args = ' '.join([
                "--conf spark.ui.enabled=false",
                submit_args
            ])
        command = command + shlex.split(submit_args)

        # Create a temporary directory where the gateway server should write the connection
        # information.
        conn_info_dir = tempfile.mkdtemp()
        try:
            fd, conn_info_file = tempfile.mkstemp(dir=conn_info_dir)
            os.close(fd)
            os.unlink(conn_info_file)

            env = dict(os.environ)
            env["_PYSPARK_DRIVER_CONN_INFO_PATH"] = conn_info_file

            # Launch the Java gateway.
            # We open a pipe to stdin so that the Java gateway can die when the pipe is broken
            if not on_windows:
                # Don't send ctrl-c / SIGINT to the Java gateway:
                def preexec_func():
                    signal.signal(signal.SIGINT, signal.SIG_IGN)
                proc = Popen(command, stdin=PIPE, preexec_fn=preexec_func, env=env)
            else:
                # preexec_fn not supported on Windows
                proc = Popen(command, stdin=PIPE, env=env)

            # Wait for the file to appear, or for the process to exit, whichever happens first.
            while not proc.poll() and not os.path.isfile(conn_info_file):
                time.sleep(0.1)

            if not os.path.isfile(conn_info_file):
                raise Exception("Java gateway process exited before sending its port number")

            with open(conn_info_file, "rb") as info:
                gateway_port = read_int(info)
                gateway_secret = UTF8Deserializer().loads(info)
        finally:
            shutil.rmtree(conn_info_dir)

        # In Windows, ensure the Java child processes do not linger after Python has exited.
        # In UNIX-based systems, the child process can kill itself on broken pipe (i.e. when
        # the parent process' stdin sends an EOF). In Windows, however, this is not possible
        # because java.lang.Process reads directly from the parent process' stdin, contending
        # with any opportunity to read an EOF from the parent. Note that this is only best
        # effort and will not take effect if the python process is violently terminated.
        if on_windows:
            # In Windows, the child process here is "spark-submit.cmd", not the JVM itself
            # (because the UNIX "exec" command is not available). This means we cannot simply
            # call proc.kill(), which kills only the "spark-submit.cmd" process but not the
            # JVMs. Instead, we use "taskkill" with the tree-kill option "/t" to terminate all
            # child processes in the tree (http://technet.microsoft.com/en-us/library/bb491009.aspx)
            def killChild():
                Popen(["cmd", "/c", "taskkill", "/f", "/t", "/pid", str(proc.pid)])
            atexit.register(killChild)

    # Connect to the gateway
    gateway = JavaGateway(
        gateway_parameters=GatewayParameters(port=gateway_port, auth_token=gateway_secret,
                                             auto_convert=True))

    # Import the classes used by PySpark
    java_import(gateway.jvm, "org.apache.spark.SparkConf")
    java_import(gateway.jvm, "org.apache.spark.api.java.*")
    java_import(gateway.jvm, "org.apache.spark.api.python.*")
    java_import(gateway.jvm, "org.apache.spark.ml.python.*")
    java_import(gateway.jvm, "org.apache.spark.mllib.api.python.*")
    # TODO(davies): move into sql
    java_import(gateway.jvm, "org.apache.spark.sql.*")
    java_import(gateway.jvm, "org.apache.spark.sql.api.python.*")
    java_import(gateway.jvm, "org.apache.spark.sql.hive.*")
    java_import(gateway.jvm, "scala.Tuple2")

    return gateway
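One practical consequence of reading launch_gateway (my observation, not something the CDH sources spell out): PYSPARK_SUBMIT_ARGS is consumed exactly once, when the first SparkContext triggers launch_gateway. So the os.environ approach from the beginning of this post can only work if the variable is set before any SparkSession/SparkContext exists in the process, roughly like this:

import os

# Must run before the first SparkContext is created; once launch_gateway has
# already executed, changing PYSPARK_SUBMIT_ARGS has no effect on the JVM classpath.
os.environ['PYSPARK_SUBMIT_ARGS'] = (
    '--jars /home/data/xgboost/xgboost4j-spark_0.90.jar,'
    '/home/data/xgboost/xgboost4j_0.90.jar pyspark-shell'
)

from pyspark.sql import SparkSession
spark = SparkSession.builder.master('yarn').appName("PySpark XGBOOST").getOrCreate()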
Back to the return value: _jvm() ultimately returns the jvm attribute of the gateway created above, i.e. JavaGateway(gateway_parameters=GatewayParameters(port=gateway_port, auth_token=gateway_secret, auto_convert=True)).jvm.
# /opt/cloudera/parcels/CDH-6.3.2-1.cdh6.3.2.p0.1605554/lib/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py
# line 1789
self.jvm = JVMView(
    self._gateway_client, jvm_name=proto.DEFAULT_JVM_NAME,
    id=proto.DEFAULT_JVM_ID)

So _jvm() returns a JVMView object from py4j.java_gateway.
# /opt/cloudera/parcels/CDH-6.3.2-1.cdh6.3.2.p0.1605554/lib/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py
class JVMView(object):
    """A `JVMView` allows access to the Java Virtual Machine of a
    `JavaGateway`.

    This can be used to reference static members (fields and methods) and
    to call constructors.
    """

    def __init__(self, gateway_client, jvm_name, id=None, jvm_object=None):
        self._gateway_client = gateway_client
        self._jvm_name = jvm_name
        if id is not None:
            self._id = id
        elif jvm_object is not None:
            self._id = proto.REFERENCE_TYPE + jvm_object._get_object_id()
            # So that both JVMView instances (on Python and Java) have the
            # same lifecycle. Theoretically, JVMView could inherit from
            # JavaObject, but I would like to avoid the use of reflection
            # for regular Py4J classes.
            self._jvm_object = jvm_object
        self._dir_sequence_and_cache = (None, [])

    def __dir__(self):
        command = proto.DIR_COMMAND_NAME +\
            proto.DIR_JVMVIEW_SUBCOMMAND_NAME +\
            self._id + "\n" +\
            get_command_part(self._dir_sequence_and_cache[0]) +\
            proto.END_COMMAND_PART
        answer = self._gateway_client.send_command(command)
        return_value = get_return_value(
            answer, self._gateway_client, self._fqn, "__dir__")
        if return_value is not None:
            result = return_value.split("\n")
            # Theoretically, not thread safe, but the worst case scenario is
            # cache miss or double overwrite of the same method...
            self._dir_sequence_and_cache = (
                result[0], result[1:] + [UserHelpAutoCompletion.KEY])
        return self._dir_sequence_and_cache[1][:]

    def __getattr__(self, name):
        if name == UserHelpAutoCompletion.KEY:
            return UserHelpAutoCompletion()

        answer = self._gateway_client.send_command(
            proto.REFLECTION_COMMAND_NAME +
            proto.REFL_GET_UNKNOWN_SUB_COMMAND_NAME + name + "\n" + self._id +
            "\n" + proto.END_COMMAND_PART)
        if answer == proto.SUCCESS_PACKAGE:
            return JavaPackage(name, self._gateway_client, jvm_id=self._id)
        elif answer.startswith(proto.SUCCESS_CLASS):
            return JavaClass(
                answer[proto.CLASS_FQN_START:], self._gateway_client)
        else:
            _, error_message = get_error_message(answer)
            message = compute_exception_message(
                "{0} does not exist in the JVM".format(name), error_message)
            raise Py4JError(message)
In __getattr__, the attribute name is wrapped with the Py4J protocol prefixes and terminator and passed to _gateway_client.send_command, which returns an answer string. If the answer is 'yp' (proto.SUCCESS_PACKAGE), a JavaPackage is created; if the answer starts with 'yc' (proto.SUCCESS_CLASS), a JavaClass is created.
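The 'yp' and 'yc' markers are just py4j protocol constants; a quick way to confirm them (assuming py4j 0.10.7, the version bundled with this CDH parcel):

from py4j import protocol as proto

print(repr(proto.SUCCESS_PACKAGE))   # 'yp' -> the name is assumed to be a package
print(repr(proto.SUCCESS_CLASS))     # 'yc' -> the name resolved to a Java class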
# /opt/cloudera/parcels/CDH-6.3.2-1.cdh6.3.2.p0.1605554/lib/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py
class JavaPackage(object):
    """A `JavaPackage` represents part of a Java package from which Java
    classes can be accessed.

    Usually, `JavaPackage` are not initialized using their constructor, but
    they are created while accessing the `jvm` property of a gateway, e.g.,
    `gateway.jvm.java.lang`.
    """

    def __init__(self, fqn, gateway_client, jvm_id=None):
        self._fqn = fqn
        self._gateway_client = gateway_client
        if jvm_id is None:
            self._jvm_id = proto.DEFAULT_JVM_ID
        self._jvm_id = jvm_id

    def __dir__(self):
        return [UserHelpAutoCompletion.KEY]

    def __getattr__(self, name):
        if name == UserHelpAutoCompletion.KEY:
            return UserHelpAutoCompletion

        if name in ["__str__", "__repr__"]:
            raise AttributeError

        if name == "__call__":
            raise Py4JError("Trying to call a package.")

        new_fqn = self._fqn + "." + name
        command = proto.REFLECTION_COMMAND_NAME +\
            proto.REFL_GET_UNKNOWN_SUB_COMMAND_NAME +\
            new_fqn + "\n" +\
            self._jvm_id + "\n" +\
            proto.END_COMMAND_PART
        answer = self._gateway_client.send_command(command)
        if answer == proto.SUCCESS_PACKAGE:
            return JavaPackage(new_fqn, self._gateway_client, self._jvm_id)
        elif answer.startswith(proto.SUCCESS_CLASS):
            return JavaClass(
                answer[proto.CLASS_FQN_START:], self._gateway_client)
        else:
            raise Py4JError("{0} does not exist in the JVM".format(new_fqn))
So what exactly is _gateway_client.send_command?
# py4j.java_gateway.JavaGateway._create_gateway_client
def _create_gateway_client(self):
    gateway_client = GatewayClient(
        gateway_parameters=self.gateway_parameters)
    return gateway_client
class GatewayClient(object):
    """Responsible for managing connections to the JavaGateway.

    This implementation is thread-safe and connections are created on-demand.
    This means that Py4J-Python can be accessed by multiple threads and
    messages are sent to and processed concurrently by the Java Gateway.

    When creating a custom :class:`JavaGateway`, it is recommended to pass an
    instance of :class:`GatewayClient` instead of a :class:`GatewayConnection`:
    both have the same interface, but the client supports multiple threads and
    connections, which is essential when using callbacks. """

    def __init__(
            self, address=DEFAULT_ADDRESS, port=DEFAULT_PORT,
            auto_close=True, gateway_property=None,
            ssl_context=None, gateway_parameters=None):
        """
        :param gateway_parameters: the set of parameters used to configure the
            GatewayClient.

        :param gateway_property: used to keep gateway preferences without a
            cycle with the gateway
        """
        if address != DEFAULT_ADDRESS:
            deprecated("GatewayClient.address", "1.0", "GatewayParameters")
        if port != DEFAULT_PORT:
            deprecated("GatewayClient.port", "1.0", "GatewayParameters")
        if not gateway_parameters:
            gateway_parameters = GatewayParameters(
                address=address, port=port, auto_close=auto_close,
                ssl_context=ssl_context)
        self.gateway_parameters = gateway_parameters
        self.address = gateway_parameters.address
        self.port = gateway_parameters.port
        self.is_connected = True
        self.auto_close = gateway_parameters.auto_close
        self.gateway_property = gateway_property
        self.ssl_context = gateway_parameters.ssl_context
        self.deque = deque()

    def garbage_collect_object(self, target_id):
        """Tells the Java side that there is no longer a reference to this
        JavaObject on the Python side.
        """
        if target_id != proto.ENTRY_POINT_OBJECT_ID and\
                target_id != proto.GATEWAY_SERVER_OBJECT_ID and\
                self.is_connected:
            try:
                self.send_command(
                    proto.MEMORY_COMMAND_NAME +
                    proto.MEMORY_DEL_SUBCOMMAND_NAME +
                    target_id +
                    "\ne\n")
            except Exception:
                logger.debug("Exception while garbage collecting an object",
                             exc_info=True)

    def _get_connection(self):
        if not self.is_connected:
            raise Py4JNetworkError("Gateway is not connected.")
        try:
            connection = self.deque.pop()
        except IndexError:
            connection = self._create_connection()
        return connection

    def _create_connection(self):
        connection = GatewayConnection(
            self.gateway_parameters, self.gateway_property)
        connection.start()
        return connection

    def _give_back_connection(self, connection):
        try:
            self.deque.append(connection)
        except Exception:
            logger.warning(
                "Exception while giving back connection", exc_info=True)

    def shutdown_gateway(self):
        """Sends a shutdown command to the gateway. This will close the
        gateway server: all active connections will be closed. This may
        be useful if the lifecycle of the Java program must be tied to
        the Python program.
        """
        connection = self._get_connection()
        try:
            connection.shutdown_gateway()
            self.close()
            self.is_connected = False
        except Py4JNetworkError:
            logger.debug("Error while shutting down gateway.", exc_info=True)
            self.shutdown_gateway()

    def send_command(self, command, retry=True, binary=False):
        """Sends a command to the JVM. This method is not intended to be
        called directly by Py4J users. It is usually called by
        :class:`JavaMember` instances.

        :param command: the `string` command to send to the JVM. The command
            must follow the Py4J protocol.

        :param retry: if `True`, the GatewayClient tries to resend a message
            if it fails.

        :param binary: if `True`, we won't wait for a Py4J-protocol response
            from the other end; we'll just return the raw connection to the
            caller. The caller becomes the owner of the connection, and is
            responsible for closing the connection (or returning it this
            `GatewayClient` pool using `_give_back_connection`).

        :rtype: the `string` answer received from the JVM (The answer follows
            the Py4J protocol). The guarded `GatewayConnection` is also returned
            if `binary` is `True`.
        """
        connection = self._get_connection()
        try:
            response = connection.send_command(command)
            if binary:
                return response, self._create_connection_guard(connection)
            elif is_fatal_error(response):
                connection.close(False)
            else:
                self._give_back_connection(connection)
        except Py4JNetworkError as pne:
            if connection:
                reset = False
                if isinstance(pne.cause, socket.timeout):
                    reset = True
                connection.close(reset)
            if self._should_retry(retry, connection, pne):
                logging.info("Exception while sending command.", exc_info=True)
                response = self.send_command(command, binary=binary)
            else:
                logging.exception(
                    "Exception while sending command.")
                response = proto.ERROR

        return response

    def _create_connection_guard(self, connection):
        return GatewayConnectionGuard(self, connection)

    def _should_retry(self, retry, connection, pne=None):
        return pne and pne.when == proto.ERROR_ON_SEND

    def close(self):
        """Closes all currently opened connections.

        This operation is not thread safe and is only a best effort strategy
        to close active connections.

        All connections are guaranteed to be closed only if no other thread
        is accessing the client and no call is pending.
        """
        size = len(self.deque)
        for _ in range(0, size):
            try:
                connection = self.deque.pop()
                quiet_close(connection)
            except IndexError:
                pass
Here self.gateway_parameters is exactly the GatewayParameters(port=gateway_port, auth_token=gateway_secret, auto_convert=True) that launch_gateway built above.
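To see send_command in action, the same reflection query that JVMView.__getattr__ and JavaPackage.__getattr__ build can be sent by hand through the JVM view's _gateway_client, this time using the proto constants instead of the hard-coded "r\nu\n...\nrj\ne\n" string from the earlier test function. A sketch, assuming an active SparkSession:

from py4j import protocol as proto

sc = spark.sparkContext
command = (proto.REFLECTION_COMMAND_NAME +
           proto.REFL_GET_UNKNOWN_SUB_COMMAND_NAME +
           "ml.dmlc.xgboost4j.scala.spark.XGBoostClassifier" + "\n" +
           proto.DEFAULT_JVM_ID + "\n" +
           proto.END_COMMAND_PART)
answer = sc._jvm._gateway_client.send_command(command)
print(answer)   # 'yp' if the class cannot be found, 'yc<fqn>' if it resolved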
Now let's run launch_gateway step by step:
conf = None
SPARK_HOME = _find_spark_home()
# Launch the Py4j gateway using Spark's run command so that we pick up the
# proper classpath and settings from spark-env.sh
on_windows = platform.system() == "Windows"
script = "./bin/spark-submit.cmd" if on_windows else "./bin/spark-submit"
command = [os.path.join(SPARK_HOME, script)]
if conf:
    for k, v in conf.getAll():
        command += ['--conf', '%s=%s' % (k, v)]
submit_args = os.environ.get("PYSPARK_SUBMIT_ARGS", "pyspark-shell")
if os.environ.get("SPARK_TESTING"):
    submit_args = ' '.join([
        "--conf spark.ui.enabled=false",
        submit_args
    ])
command = command + shlex.split(submit_args)
command

['/opt/cloudera/parcels/CDH-6.3.2-1.cdh6.3.2.p0.1605554/lib/spark/./bin/spark-submit',
 '--name',
 'PySparkShell',
 'pyspark-shell']
conn_info_dir = tempfile.mkdtemp()
conn_info_dir

'/tmp/tmpmtntjgjm'

fd, conn_info_file = tempfile.mkstemp(dir=conn_info_dir)
print(fd)
print(conn_info_file)
os.close(fd)
os.unlink(conn_info_file)
env = dict(os.environ)
env["_PYSPARK_DRIVER_CONN_INFO_PATH"] = conn_info_file

63
/tmp/tmpmtntjgjm/tmpzgljq1n4
if not on_windows:
    # Don't send ctrl-c / SIGINT to the Java gateway:
    def preexec_func():
        signal.signal(signal.SIGINT, signal.SIG_IGN)
    proc = Popen(command, stdin=PIPE, preexec_fn=preexec_func, env=env)
    print(proc)
else:
    # preexec_fn not supported on Windows
    proc = Popen(command, stdin=PIPE, env=env)

while not proc.poll() and not os.path.isfile(conn_info_file):
    time.sleep(0.1)
if not os.path.isfile(conn_info_file):
    raise Exception("Java gateway process exited before sending its port number")
with open(conn_info_file, "rb") as info:
    gateway_port = read_int(info)
    gateway_secret = UTF8Deserializer().loads(info)
print(gateway_port)
print(gateway_secret)

42721
460481f49d41e47b6a1273c9ad9fb4dcd0b131ef68760c037cd97c434f5123ae
shutil.rmtree(conn_info_dir)

gateway = JavaGateway(
    gateway_parameters=GatewayParameters(port=gateway_port, auth_token=gateway_secret,
                                         auto_convert=True))

gateway_parameters = GatewayParameters(port=gateway_port, auth_token=gateway_secret,
                                       auto_convert=True)
print(gateway_parameters.address)
print(gateway_parameters.port)

127.0.0.1
42721
In the end, the fix was to copy the two xgboost4j jars into the CDH lib/spark/jars/ directory on every node of the cluster, so that they are always on Spark's classpath.
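Once the jars are in place (and the session is restarted), the lookup resolves to a JavaClass and the original code path works. A quick post-fix sanity check; the no-argument constructor call is a sketch, since the exact constructors depend on the xgboost4j-spark version:

from py4j.java_gateway import JavaClass
from pyspark.ml.wrapper import JavaWrapper

cls = spark.sparkContext._jvm.ml.dmlc.xgboost4j.scala.spark.XGBoostClassifier
print(isinstance(cls, JavaClass))   # expected True now

# _new_java_obj now returns a py4j JavaObject instead of raising the TypeError
est = JavaWrapper._new_java_obj("ml.dmlc.xgboost4j.scala.spark.XGBoostClassifier")
print(est)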