CDH-pyspark-xgboost TypeError: 'JavaPackage' object is not callable

使用CDH版本的pyspark进行xgboost训练时,一直报一个错误 “TypeError: ‘JavaPackage’ object is not callable”。起初以为是jar包版本或路径错误,因此尝试了多个版本的xgboost4j,包括0.72、0.81、0.9、1.0.0、1.1.2,以及分别使用
# First attempt: pass the local xgboost4j jars through PYSPARK_SUBMIT_ARGS, which
# launch_gateway() reads from the environment when it builds the spark-submit command.
os.environ['PYSPARK_SUBMIT_ARGS'] = '--jars /home/data/xgboost/xgboost4j-spark_0.90.jar,/home/data/xgboost/xgboost4j_0.90.jar pyspark-shell'

# Second attempt: build the session on YARN and list the xgboost4j jars (stored on
# HDFS) in the comma-separated 'spark.jars' setting.
spark = SparkSession\
        .builder\
        .master('yarn') \
        .appName("PySpark XGBOOST")\
        .config('spark.jars',
            'hdfs://xgb/xgb_1.0.0/xgboost4j-spark_2.11-1.0.0.jar,hdfs://xgb/xgb_1.0.0/xgboost4j_2.11-1.0.0.jar') \
        .getOrCreate()

方式都不行。
CDH-pyspark-xgboost TypeError: ‘JavaPackage‘ object is not callable_第1张图片
决定分析源码。

# /opt/cloudera/parcels/CDH-6.3.2-1.cdh6.3.2.p0.1605554/lib/spark/python/pyspark/ml/wrapper.py
from pyspark.ml.util import _jvm
    @staticmethod
    def _new_java_obj(java_class, *args):
        """
        Instantiate a Java class through the active JVM view.

        ``java_class`` is a dotted, fully qualified class name. Each segment
        is resolved with ``getattr`` starting from the object returned by
        ``_jvm()``; the object reached at the end is then called with the
        arguments converted by ``_py2java``.
        """
        sc = SparkContext._active_spark_context
        target = _jvm()
        # Walk the dotted path one attribute at a time.
        for segment in java_class.split("."):
            target = getattr(target, segment)
        return target(*[_py2java(sc, value) for value in args])

先来看看_jvm()返回的是什么?下方是测试代码

from pyspark.ml.util import _jvm
def _new_java_obj(java_class, *args):
    """
    Instrumented copy of ``_new_java_obj`` that traces class resolution.

    Resolves the dotted ``java_class`` name one segment at a time starting
    from the object returned by ``_jvm()``. After every step it prints the
    dotted prefix resolved so far, the raw answer the gateway client gives
    for that prefix, and the Python object obtained. Finally the resolved
    object is called with the converted ``args`` and the result is returned.

    :param java_class: dotted, fully qualified Java class name
    :param args: constructor arguments, converted with ``_py2java``
    :return: whatever calling the resolved object returns
    """
    sc = SparkContext._active_spark_context
    java_obj = _jvm()
    print('java_obj',':',java_obj)
    print('***************************')
    java_class_list = java_class.split(".")
    # enumerate() gives the true position of each segment. list.index() would
    # return the first occurrence, so a path with a repeated segment
    # (e.g. "a.spark.b.spark.C") would print and send the wrong prefix.
    for i_index, name in enumerate(java_class_list, start=1):
        com = '.'.join(java_class_list[:i_index])
        java_obj = getattr(java_obj, name)
        # Raw py4j request for the prefix; it appears to mirror the reflection
        # command assembled in JavaPackage.__getattr__ (command name,
        # sub-command, fqn, JVM id "rj", end marker) -- TODO confirm against
        # py4j.protocol.
        command = f"r\nu\n{com}\nrj\ne\n"
        print('command',':',com)
        print('client answer',':',java_obj._gateway_client.send_command(command))
        print('java_obj',':',java_obj)
        print('-----------------------------')
    java_args = [_py2java(sc, arg) for arg in args]
    print(java_args)
    return java_obj(*java_args)
# Alternative classes to probe, left disabled:
# _new_java_obj("com.microsoft.ml.spark.lightgbm.LightGBMRegressor")
# _new_java_obj('ml.dmlc.xgboost4j.scala.spark.XGBoostClassifier')
# Probe a class from the org.apache.spark.ml package and trace how it is resolved.
_new_java_obj("org.apache.spark.ml.regression.GBTRegressor")

CDH-pyspark-xgboost TypeError: ‘JavaPackage‘ object is not callable_第2张图片
第一步 :java_obj 是一个py4j.java_gateway.JVMView object
第二到第六:java_obj是一个py4j.java_gateway.JavaPackage
第七:java_obj是一个py4j.java_gateway.JavaClass
如果传入的类路径无效(JVM中找不到对应的类),例如

# Probe the xgboost4j class: in this environment the lookup ends on a JavaPackage
# instead of a JavaClass, so the final call fails.
_new_java_obj('ml.dmlc.xgboost4j.scala.spark.XGBoostClassifier')

CDH-pyspark-xgboost TypeError: ‘JavaPackage‘ object is not callable_第3张图片
最后返回一个JavaPackage,它是不能被调用的。

# /opt/cloudera/parcels/CDH-6.3.2-1.cdh6.3.2.p0.1605554/lib/spark/python/pyspark/ml/util.py
from pyspark import SparkContext
def _jvm():
    """
    Return the JVM view stored on ``SparkContext``.

    The view only exists once a SparkContext has been initialized; if the
    attribute is still falsy an ``AttributeError`` is raised.
    """
    view = SparkContext._jvm
    if not view:
        raise AttributeError("Cannot load _jvm from SparkContext. Is SparkContext initialized?")
    return view
        
        
# /opt/cloudera/parcels/CDH-6.3.2-1.cdh6.3.2.p0.1605554/lib/spark/python/pyspark/ml/__init__.py      
from pyspark.context import SparkContext
# /opt/cloudera/parcels/CDH-6.3.2-1.cdh6.3.2.p0.1605554/lib/spark/python/pyspark/context.py
# line 298-299
from pyspark.java_gateway import launch_gateway
# Use the existing `gateway` when it is truthy; otherwise start a new one.
SparkContext._gateway = gateway or launch_gateway(conf=None)
# Cache the gateway's JVM view on the class; _jvm() reads this attribute.
SparkContext._jvm = SparkContext._gateway.jvm

# /opt/cloudera/parcels/CDH-6.3.2-1.cdh6.3.2.p0.1605554/lib/spark/python/pyspark/java_gateway.py
from py4j.java_gateway import java_import, JavaGateway, JavaObject, GatewayParameters
from pyspark.find_spark_home import _find_spark_home
def launch_gateway(conf=None):
    """
    Launch the JVM gateway (or attach to an existing one) and connect to it.

    When ``PYSPARK_GATEWAY_PORT`` is present in the environment, the port and
    ``PYSPARK_GATEWAY_SECRET`` are read from the environment and no process is
    started. Otherwise ``spark-submit`` is spawned with the arguments from
    ``PYSPARK_SUBMIT_ARGS`` (default ``"pyspark-shell"``) and the port and
    secret are read from a temporary connection-info file whose path is handed
    to the child through ``_PYSPARK_DRIVER_CONN_INFO_PATH``.

    :param conf: spark configuration passed to spark-submit
    :return: a ``JavaGateway`` connected to the gateway, with the classes used
        by PySpark already imported into its ``jvm`` view
    :raises Exception: if the gateway process exits before the connection-info
        file appears
    """
    if "PYSPARK_GATEWAY_PORT" in os.environ:
        gateway_port = int(os.environ["PYSPARK_GATEWAY_PORT"])
        gateway_secret = os.environ["PYSPARK_GATEWAY_SECRET"]
    else:
        SPARK_HOME = _find_spark_home()
        # Launch the Py4j gateway using Spark's run command so that we pick up the
        # proper classpath and settings from spark-env.sh
        on_windows = platform.system() == "Windows"
        script = "./bin/spark-submit.cmd" if on_windows else "./bin/spark-submit"
        command = [os.path.join(SPARK_HOME, script)]
        if conf:
            for k, v in conf.getAll():
                command += ['--conf', '%s=%s' % (k, v)]
        submit_args = os.environ.get("PYSPARK_SUBMIT_ARGS", "pyspark-shell")
        if os.environ.get("SPARK_TESTING"):
            submit_args = ' '.join([
                "--conf spark.ui.enabled=false",
                submit_args
            ])
        command = command + shlex.split(submit_args)

        # Create a temporary directory where the gateway server should write the connection
        # information.
        conn_info_dir = tempfile.mkdtemp()
        try:
            # Reserve a unique path, then remove the file again: the loop below
            # treats the file's (re)appearance as the "gateway is ready" signal.
            fd, conn_info_file = tempfile.mkstemp(dir=conn_info_dir)
            os.close(fd)
            os.unlink(conn_info_file)

            env = dict(os.environ)
            env["_PYSPARK_DRIVER_CONN_INFO_PATH"] = conn_info_file

            # Launch the Java gateway.
            # We open a pipe to stdin so that the Java gateway can die when the pipe is broken
            if not on_windows:
                # Don't send ctrl-c / SIGINT to the Java gateway:
                def preexec_func():
                    signal.signal(signal.SIGINT, signal.SIG_IGN)
                proc = Popen(command, stdin=PIPE, preexec_fn=preexec_func, env=env)
            else:
                # preexec_fn not supported on Windows
                proc = Popen(command, stdin=PIPE, env=env)

            # Wait for the file to appear, or for the process to exit, whichever happens first.
            # poll() returns None only while the child is still running. Testing
            # `not proc.poll()` would also keep waiting after an exit code of 0
            # and spin forever if the file was never written.
            while proc.poll() is None and not os.path.isfile(conn_info_file):
                time.sleep(0.1)

            if not os.path.isfile(conn_info_file):
                raise Exception("Java gateway process exited before sending its port number")

            with open(conn_info_file, "rb") as info:
                gateway_port = read_int(info)
                gateway_secret = UTF8Deserializer().loads(info)
        finally:
            # The connection info has been read (or startup failed); drop the directory.
            shutil.rmtree(conn_info_dir)

        # In Windows, ensure the Java child processes do not linger after Python has exited.
        # In UNIX-based systems, the child process can kill itself on broken pipe (i.e. when
        # the parent process' stdin sends an EOF). In Windows, however, this is not possible
        # because java.lang.Process reads directly from the parent process' stdin, contending
        # with any opportunity to read an EOF from the parent. Note that this is only best
        # effort and will not take effect if the python process is violently terminated.
        if on_windows:
            # In Windows, the child process here is "spark-submit.cmd", not the JVM itself
            # (because the UNIX "exec" command is not available). This means we cannot simply
            # call proc.kill(), which kills only the "spark-submit.cmd" process but not the
            # JVMs. Instead, we use "taskkill" with the tree-kill option "/t" to terminate all
            # child processes in the tree (http://technet.microsoft.com/en-us/library/bb491009.aspx)
            def killChild():
                Popen(["cmd", "/c", "taskkill", "/f", "/t", "/pid", str(proc.pid)])
            atexit.register(killChild)

    # Connect to the gateway
    gateway = JavaGateway(
        gateway_parameters=GatewayParameters(port=gateway_port, auth_token=gateway_secret,
                                             auto_convert=True))

    # Import the classes used by PySpark
    java_import(gateway.jvm, "org.apache.spark.SparkConf")
    java_import(gateway.jvm, "org.apache.spark.api.java.*")
    java_import(gateway.jvm, "org.apache.spark.api.python.*")
    java_import(gateway.jvm, "org.apache.spark.ml.python.*")
    java_import(gateway.jvm, "org.apache.spark.mllib.api.python.*")
    # TODO(davies): move into sql
    java_import(gateway.jvm, "org.apache.spark.sql.*")
    java_import(gateway.jvm, "org.apache.spark.sql.api.python.*")
    java_import(gateway.jvm, "org.apache.spark.sql.hive.*")
    java_import(gateway.jvm, "scala.Tuple2")

    return gateway

_jvm()返回的是JavaGateway(
gateway_parameters=GatewayParameters(port=gateway_port, auth_token=gateway_secret,
auto_convert=True)).jvm

#/opt/cloudera/parcels/CDH-6.3.2-1.cdh6.3.2.p0.1605554/lib/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py
# line 1789
        self.jvm = JVMView(
            self._gateway_client, jvm_name=proto.DEFAULT_JVM_NAME,
            id=proto.DEFAULT_JVM_ID)

因此_jvm()返回的是py4j.java_gateway里的一个JVMView对象。

#/opt/cloudera/parcels/CDH-6.3.2-1.cdh6.3.2.p0.1605554/lib/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py
class JVMView(object):
    """A `JVMView` allows access to the Java Virtual Machine of a
       `JavaGateway`.

       This can be used to reference static members (fields and methods) and
       to call constructors.

       Attribute access on a view is forwarded to the JVM as a reflection
       request; depending on the answer a `JavaPackage` or a `JavaClass` is
       returned.
    """

    def __init__(self, gateway_client, jvm_name, id=None, jvm_object=None):
        """
        :param gateway_client: client used to send commands to the JVM
        :param jvm_name: name of this view
        :param id: identifier of the view on the Java side; takes precedence
            over `jvm_object`
        :param jvm_object: Java-side view object from which the identifier is
            derived when `id` is not given
        """
        self._gateway_client = gateway_client
        self._jvm_name = jvm_name
        # NOTE: when neither `id` nor `jvm_object` is supplied, `_id` is never
        # assigned; callers are expected to provide one of them.
        if id is not None:
            self._id = id
        elif jvm_object is not None:
            self._id = proto.REFERENCE_TYPE + jvm_object._get_object_id()
            # So that both JVMView instances (on Python and Java) have the
            # same lifecycle. Theoretically, JVMView could inherit from
            # JavaObject, but I would like to avoid the use of reflection
            # for regular Py4J classes.
            self._jvm_object = jvm_object

        # (sequence id returned by the JVM, cached list of names)
        self._dir_sequence_and_cache = (None, [])

    def __dir__(self):
        """Return the names known to this view, refreshing the cache from the
        JVM when the JVM reports a new listing."""
        command = proto.DIR_COMMAND_NAME +\
            proto.DIR_JVMVIEW_SUBCOMMAND_NAME +\
            self._id + "\n" +\
            get_command_part(self._dir_sequence_and_cache[0]) +\
            proto.END_COMMAND_PART

        answer = self._gateway_client.send_command(command)
        # Identify the target by the view's name. JVMView defines no `_fqn`
        # attribute, so reading it fell through to __getattr__ and sent a
        # bogus reflection request for "_fqn" to the JVM.
        return_value = get_return_value(
            answer, self._gateway_client, self._jvm_name, "__dir__")
        if return_value is not None:
            result = return_value.split("\n")
            # Theoretically, not thread safe, but the worst case scenario is
            # cache miss or double overwrite of the same method...
            self._dir_sequence_and_cache = (
                result[0], result[1:] + [UserHelpAutoCompletion.KEY])
        # Hand out a copy so callers cannot mutate the cached list.
        return self._dir_sequence_and_cache[1][:]

    def __getattr__(self, name):
        """Resolve `name` in the JVM.

        :return: a `JavaPackage` when the JVM answers with the package marker,
            a `JavaClass` when the answer starts with the class marker
        :raises Py4JError: for any other answer
        """
        if name == UserHelpAutoCompletion.KEY:
            return UserHelpAutoCompletion()

        answer = self._gateway_client.send_command(
            proto.REFLECTION_COMMAND_NAME +
            proto.REFL_GET_UNKNOWN_SUB_COMMAND_NAME + name + "\n" + self._id +
            "\n" + proto.END_COMMAND_PART)
        if answer == proto.SUCCESS_PACKAGE:
            return JavaPackage(name, self._gateway_client, jvm_id=self._id)
        elif answer.startswith(proto.SUCCESS_CLASS):
            # The remainder of the answer is the class's fully qualified name.
            return JavaClass(
                answer[proto.CLASS_FQN_START:], self._gateway_client)
        else:
            _, error_message = get_error_message(answer)
            message = compute_exception_message(
                "{0} does not exist in the JVM".format(name), error_message)
            raise Py4JError(message)

JVMView拿到name之后,按py4j协议拼出一条反射命令(命令前缀 + name + JVM id + 结束标记),然后交给_gateway_client.send_command发送给JVM。
JVM随后返回一个answer:如果answer等于'yp',就生成一个JavaPackage;如果answer以'yc'开头,则生成一个JavaClass;其他情况抛出Py4JError。

#/opt/cloudera/parcels/CDH-6.3.2-1.cdh6.3.2.p0.1605554/lib/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py
class JavaPackage(object):
    """A `JavaPackage` represents part of a Java package from which Java
       classes can be accessed.

       Usually, `JavaPackage` are not initialized using their constructor, but
       they are created while accessing the `jvm` property of a gateway, e.g.,
       `gateway.jvm.java.lang`.
    """
    def __init__(self, fqn, gateway_client, jvm_id=None):
        """
        :param fqn: dotted name of the package resolved so far
        :param gateway_client: client used to send commands to the JVM
        :param jvm_id: identifier of the JVM view to resolve names in;
            defaults to `proto.DEFAULT_JVM_ID`
        """
        self._fqn = fqn
        self._gateway_client = gateway_client
        # Apply the default only when no id was given. The assignment used to
        # be unconditional, which overwrote the default with None and made
        # the string concatenation in __getattr__ fail.
        if jvm_id is None:
            self._jvm_id = proto.DEFAULT_JVM_ID
        else:
            self._jvm_id = jvm_id

    def __dir__(self):
        return [UserHelpAutoCompletion.KEY]

    def __getattr__(self, name):
        """Resolve `name` relative to this package.

        :return: a nested `JavaPackage` when the JVM answers with the package
            marker, a `JavaClass` when the answer starts with the class marker
        :raises AttributeError: for `__str__` / `__repr__`
        :raises Py4JError: for `__call__`, or when the JVM gives any other
            answer
        """
        if name == UserHelpAutoCompletion.KEY:
            return UserHelpAutoCompletion

        if name in ["__str__", "__repr__"]:
            raise AttributeError

        # A package is not callable; looking up __call__ is reported as an error.
        if name == "__call__":
            raise Py4JError("Trying to call a package.")

        new_fqn = self._fqn + "." + name
        command = proto.REFLECTION_COMMAND_NAME +\
            proto.REFL_GET_UNKNOWN_SUB_COMMAND_NAME +\
            new_fqn + "\n" +\
            self._jvm_id + "\n" +\
            proto.END_COMMAND_PART
        answer = self._gateway_client.send_command(command)
        if answer == proto.SUCCESS_PACKAGE:
            return JavaPackage(new_fqn, self._gateway_client, self._jvm_id)

        elif answer.startswith(proto.SUCCESS_CLASS):
            # The remainder of the answer is the class's fully qualified name.
            return JavaClass(
                answer[proto.CLASS_FQN_START:], self._gateway_client)
        else:
            raise Py4JError("{0} does not exist in the JVM".format(new_fqn))

_gateway_client.send_command是个什么东西?

    def _create_gateway_client(self):
        """Build a `GatewayClient` configured with this object's
        `gateway_parameters` and return it."""
        return GatewayClient(gateway_parameters=self.gateway_parameters)

class GatewayClient(object):
    """Responsible for managing connections to the JavaGateway.

    This implementation is thread-safe and connections are created on-demand.
    This means that Py4J-Python can be accessed by multiple threads and
    messages are sent to and processed concurrently by the Java Gateway.

    When creating a custom :class:`JavaGateway`, it is recommended to pass an
    instance of :class:`GatewayClient` instead of a :class:`GatewayConnection`:
    both have the same interface, but the client supports multiple threads and
    connections, which is essential when using callbacks.  """

    def __init__(
            self, address=DEFAULT_ADDRESS, port=DEFAULT_PORT,
            auto_close=True, gateway_property=None,
            ssl_context=None, gateway_parameters=None):
        """
        :param address: deprecated in favour of `gateway_parameters`; only
            used when `gateway_parameters` is not given

        :param port: deprecated in favour of `gateway_parameters`; only used
            when `gateway_parameters` is not given

        :param auto_close: only used when `gateway_parameters` is not given

        :param ssl_context: only used when `gateway_parameters` is not given

        :param gateway_parameters: the set of parameters used to configure the
            GatewayClient.

        :param gateway_property: used to keep gateway preferences without a
            cycle with the gateway
        """
        # Passing a non-default address/port directly triggers a deprecation notice.
        if address != DEFAULT_ADDRESS:
            deprecated("GatewayClient.address", "1.0", "GatewayParameters")
        if port != DEFAULT_PORT:
            deprecated("GatewayClient.port", "1.0", "GatewayParameters")

        # Fall back to parameters built from the individual keyword arguments.
        if not gateway_parameters:
            gateway_parameters = GatewayParameters(
                address=address, port=port, auto_close=auto_close,
                ssl_context=ssl_context)

        self.gateway_parameters = gateway_parameters
        self.address = gateway_parameters.address
        self.port = gateway_parameters.port
        # Set to False by shutdown_gateway().
        self.is_connected = True
        self.auto_close = gateway_parameters.auto_close
        self.gateway_property = gateway_property
        self.ssl_context = gateway_parameters.ssl_context
        # Pool of idle connections: taken with pop(), returned with append().
        self.deque = deque()

    def garbage_collect_object(self, target_id):
        """Tells the Java side that there is no longer a reference to this
        JavaObject on the Python side.

        Nothing is sent for the entry point object, for the gateway server
        object, or once the client is no longer connected. Any exception
        raised while sending is logged at debug level and swallowed.
        """
        if target_id != proto.ENTRY_POINT_OBJECT_ID and\
                target_id != proto.GATEWAY_SERVER_OBJECT_ID and\
                self.is_connected:
            try:
                self.send_command(
                    proto.MEMORY_COMMAND_NAME +
                    proto.MEMORY_DEL_SUBCOMMAND_NAME +
                    target_id +
                    "\ne\n")
            except Exception:
                logger.debug("Exception while garbage collecting an object",
                             exc_info=True)

    def _get_connection(self):
        """Return an idle connection from the pool, or a newly created one
        when the pool is empty.

        :raises Py4JNetworkError: if the client is no longer connected
        """
        if not self.is_connected:
            raise Py4JNetworkError("Gateway is not connected.")
        try:
            connection = self.deque.pop()
        except IndexError:
            # Pool is empty: open a new connection on demand.
            connection = self._create_connection()
        return connection

    def _create_connection(self):
        """Create a `GatewayConnection` from this client's parameters, start
        it and return it."""
        connection = GatewayConnection(
            self.gateway_parameters, self.gateway_property)
        connection.start()
        return connection

    def _give_back_connection(self, connection):
        """Put `connection` back into the pool; a failure to do so is logged
        as a warning and otherwise ignored."""
        try:
            self.deque.append(connection)
        except Exception:
            logger.warning(
                "Exception while giving back connection", exc_info=True)

    def shutdown_gateway(self):
        """Sends a shutdown command to the gateway. This will close the
           gateway server: all active connections will be closed. This may
           be useful if the lifecycle of the Java program must be tied to
           the Python program.
        """
        connection = self._get_connection()
        try:
            connection.shutdown_gateway()
            self.close()
            self.is_connected = False
        except Py4JNetworkError:
            logger.debug("Error while shutting down gateway.", exc_info=True)
            # Try again; the nested call obtains its own connection.
            self.shutdown_gateway()

    def send_command(self, command, retry=True, binary=False):
        """Sends a command to the JVM. This method is not intended to be
           called directly by Py4J users. It is usually called by
           :class:`JavaMember` instances.

        :param command: the `string` command to send to the JVM. The command
         must follow the Py4J protocol.

        :param retry: if `True`, the GatewayClient tries to resend a message
         if it fails.

        :param binary: if `True`, we won't wait for a Py4J-protocol response
         from the other end; we'll just return the raw connection to the
         caller. The caller becomes the owner of the connection, and is
         responsible for closing the connection (or returning it this
         `GatewayClient` pool using `_give_back_connection`).

        :rtype: the `string` answer received from the JVM (The answer follows
         the Py4J protocol). The guarded `GatewayConnection` is also returned
         if `binary` is `True`.
        """
        connection = self._get_connection()
        try:
            response = connection.send_command(command)
            if binary:
                return response, self._create_connection_guard(connection)
            elif is_fatal_error(response):
                # A fatal answer: close the connection instead of pooling it.
                connection.close(False)
            else:
                self._give_back_connection(connection)
        except Py4JNetworkError as pne:
            if connection:
                # Request a reset on close only when the failure was a socket timeout.
                reset = False
                if isinstance(pne.cause, socket.timeout):
                    reset = True
                connection.close(reset)
            if self._should_retry(retry, connection, pne):
                logging.info("Exception while sending command.", exc_info=True)
                # The nested call obtains a different connection via _get_connection().
                response = self.send_command(command, binary=binary)
            else:
                logging.exception(
                    "Exception while sending command.")
                # Report the failure to the caller as a protocol-level error answer.
                response = proto.ERROR

        return response

    def _create_connection_guard(self, connection):
        """Wrap `connection` in a `GatewayConnectionGuard` bound to this client."""
        return GatewayConnectionGuard(self, connection)

    def _should_retry(self, retry, connection, pne=None):
        """Decide whether a failed command should be sent again.

        Returns a truthy value only when `pne` is set and the error happened
        while sending (`proto.ERROR_ON_SEND`).
        """
        # NOTE(review): `retry` and `connection` are not consulted here, so
        # send_command(retry=False) still retries on a send error -- confirm
        # whether that is intended.
        return pne and pne.when == proto.ERROR_ON_SEND

    def close(self):
        """Closes all currently opened connections.

        This operation is not thread safe and is only a best effort strategy
        to close active connections.

        All connections are guaranteed to be closed only if no other thread
        is accessing the client and no call is pending.
        """
        # Drain at most as many connections as were pooled when close() started.
        size = len(self.deque)
        for _ in range(0, size):
            try:
                connection = self.deque.pop()
                quiet_close(connection)
            except IndexError:
                # The pool ran empty early (e.g. another thread took a connection).
                pass

self.gateway_parameters就是上面launch_gateway 里面的GatewayParameters(port=gateway_port, auth_token=gateway_secret,auto_convert=True)。
我们把launch_gateway分解

# Step 1 of launch_gateway(), run by hand: assemble the spark-submit command line.
conf = None
SPARK_HOME = _find_spark_home()
# Launch the Py4j gateway using Spark's run command so that we pick up the
# proper classpath and settings from spark-env.sh
on_windows = platform.system() == "Windows"
script = "./bin/spark-submit.cmd" if on_windows else "./bin/spark-submit"
command = [os.path.join(SPARK_HOME, script)]
# conf is None here, so no --conf options are appended.
if conf:
    for k, v in conf.getAll():
        command += ['--conf', '%s=%s' % (k, v)]
# Extra spark-submit arguments come from the environment, defaulting to "pyspark-shell".
submit_args = os.environ.get("PYSPARK_SUBMIT_ARGS", "pyspark-shell")
if os.environ.get("SPARK_TESTING"):
    submit_args = ' '.join([
        "--conf spark.ui.enabled=false",
        submit_args
    ])
# shlex.split() tokenizes the argument string the way a shell would.
command = command + shlex.split(submit_args)
# Display the final command.
command

['/opt/cloudera/parcels/CDH-6.3.2-1.cdh6.3.2.p0.1605554/lib/spark/./bin/spark-submit',
'--name',
'PySparkShell',
'pyspark-shell']

# Step 2: create the temporary directory that will hold the connection-info file.
conn_info_dir = tempfile.mkdtemp()
# Display the directory path.
conn_info_dir

‘/tmp/tmpmtntjgjm’

# Step 3: reserve a unique file name inside the temporary directory, then close and
# delete the file so that only the path remains.
fd, conn_info_file = tempfile.mkstemp(dir=conn_info_dir)
print(fd)
print(conn_info_file)
os.close(fd)
os.unlink(conn_info_file)

# Copy the current environment and pass the connection-info path to the child process.
env = dict(os.environ)
env["_PYSPARK_DRIVER_CONN_INFO_PATH"] = conn_info_file

63
/tmp/tmpmtntjgjm/tmpzgljq1n4

# Step 4: start spark-submit and wait until it has written its connection info.
if not on_windows:
    # Don't send ctrl-c / SIGINT to the Java gateway:
    def preexec_func():
        signal.signal(signal.SIGINT, signal.SIG_IGN)
    proc = Popen(command, stdin=PIPE, preexec_fn=preexec_func, env=env)
    print(proc)
else:
    # preexec_fn not supported on Windows
    proc = Popen(command, stdin=PIPE, env=env)

# poll() returns None only while the child is still running. Testing
# `not proc.poll()` would also keep waiting after an exit code of 0 and spin
# forever if the file was never written.
while proc.poll() is None and not os.path.isfile(conn_info_file):
    time.sleep(0.1)
if not os.path.isfile(conn_info_file):
    raise Exception("Java gateway process exited before sending its port number")
# The file holds the gateway port followed by the authentication secret.
with open(conn_info_file, "rb") as info:
    gateway_port = read_int(info)
    gateway_secret = UTF8Deserializer().loads(info)
    print(gateway_port)
    print(gateway_secret)

42721
460481f49d41e47b6a1273c9ad9fb4dcd0b131ef68760c037cd97c434f5123ae

# Step 5: the connection info has been read, so the temporary directory can go.
shutil.rmtree(conn_info_dir)
# Connect to the gateway process using the port and secret read above.
gateway = JavaGateway(
    gateway_parameters=GatewayParameters(port=gateway_port, auth_token=gateway_secret,
                                         auto_convert=True))
# Build the same parameters separately to inspect the address and port they carry.
gateway_parameters = GatewayParameters(port=gateway_port, auth_token=gateway_secret,
                                         auto_convert=True)
print(gateway_parameters.address)
print(gateway_parameters.port)

‘127.0.0.1’
42721
结论:报错的原因是JVM的classpath中找不到xgboost4j的类,py4j只能把类名解析成JavaPackage,而JavaPackage不能被调用。最终的解决方式是把这两个jar包放到各个节点的CDH/lib/spark/jars/目录下。

你可能感兴趣的:(pyspark,spark,big,data,java)