Spark Row 在 Java 和 Scala 中的实例化方法


It is invalid to use the native primitive interface to retrieve a value that is null; instead, a user must check isNullAt before attempting to retrieve a value that might be null.

To create a new Row, use RowFactory.create() in Java or Row.apply() in Scala.

A Row object can be constructed by providing field values. Example:

import org.apache.spark.sql._

// Create a Row from values.
Row(value1, value2, value3, ...)
// Create a Row from a Seq of values.
Row.fromSeq(Seq(value1, value2, ...))

A value of a row can be accessed through both generic access by ordinal, which will incur boxing overhead for primitives, as well as native primitive access. An example of generic access by ordinal:

import org.apache.spark.sql._

val row = Row(1, true, "a string", null)
// row: Row = [1,true,a string,null]
val firstValue = row(0)
// firstValue: Any = 1
val fourthValue = row(3)
// fourthValue: Any = null

For native primitive access, it is invalid to use the native primitive interface to retrieve a value that is null; instead, a user must check isNullAt before attempting to retrieve a value that might be null. An example of native primitive access:

// using the row from the previous example.
val firstValue = row.getInt(0)
// firstValue: Int = 1
val isNull = row.isNullAt(3)
// isNull: Boolean = true




Example 1

public void test_getDataSetResult() {
public void test_getDataSetResult() {

    StructField[] structFields = new StructField[]{
            new StructField("intColumn", DataTypes.IntegerType, true, Metadata.empty()),
            new StructField("stringColumn", DataTypes.StringType, true, Metadata.empty())

    StructType structType = new StructType(structFields);

    List rows = new ArrayList<>();
    rows.add(RowFactory.create(1, "v1"));
    rows.add(RowFactory.create(2, "v2"));

    Dataset df = sparkSession.createDataFrame(rows, structType);

    DataSetResult dataSetResult = SparkUtils.getDataSetResult(df);
    Assert.assertEquals(2, dataSetResult.getColumnNames().size());
    Assert.assertEquals(2, dataSetResult.getRows().size());
    Assert.assertEquals(new Integer(1), dataSetResult.getRows().get(0).get(0));
    Assert.assertEquals("v1", dataSetResult.getRows().get(0).get(1));
    Assert.assertEquals(new Integer(2), dataSetResult.getRows().get(1).get(0));
    Assert.assertEquals("v2", dataSetResult.getRows().get(1).get(1));

Example 2

Project: bunsen   File:   Source Code and License 11 votes vote up
 * Reads the LOINC multiaxial hierarchy file and converts it to a {@link HierarchicalElement}
 * dataset.
 * @param spark the Spark session
 * @param loincHierarchyPath path to the multiaxial hierarchy CSV
 * @return a dataset of {@link HierarchicalElement} representing the hierarchical relationship.
public static Dataset readMultiaxialHierarchyFile(SparkSession spark,
    String loincHierarchyPath) {

      .option("header", true)
      .select(col("IMMEDIATE_PARENT"), col("CODE"))
      .map((MapFunction) row -> {

        HierarchicalElement element = new HierarchicalElement();



        return element;
      }, Hierarchies.getHierarchicalElementEncoder());

Example 3

Project: gaffer-doc   File:   Source Code and License 9 votes vote up
public void getDataFrameOfElementsWithEntityGroup() {
    // ---------------------------------------------------------
    final GetDataFrameOfElements operation = new GetDataFrameOfElements.Builder()
            .view(new View.Builder()
    // ---------------------------------------------------------

    final Dataset df = runExample(operation, null);

    // Restrict to entities involving certain vertices
    final Dataset seeded = df.filter("vertex = 1 OR vertex = 2");
    String result = seeded.showString(100, 20);
    printJava("df.filter(\"vertex = 1 OR vertex = 2\").show();");
    print("The results are:\n");
    print(result.substring(0, result.length() - 2));

    // Filter by property
    final Dataset filtered = df.filter("count > 1");
    result = filtered.showString(100, 20);
    printJava("df.filter(\"count > 1\").show();");
    print("The results are:\n");
    print(result.substring(0, result.length() - 2));

Example 4

Project: PRoST   File:   Source Code and License 7 votes vote up
public Dataset computeJoins(SQLContext sqlContext){
	// compute all the joins
	Dataset results = node.computeJoinWithChildren(sqlContext);
	// select only the requested result
	Column [] selectedColumns = new Column[node.projection.size()];
	for (int i = 0; i < selectedColumns.length; i++) {
		selectedColumns[i]= new Column(node.projection.get(i));

	// if there is a filter set, apply it
	results =  filter == null ? : results.filter(filter).select(selectedColumns);
	// if results are distinct
	if(selectDistinct) results = results.distinct();
	return results;

Example 5

Project: integrations   File:   Source Code and License 7 votes vote up
public static String getSubjectIdentification( Row row ) {
    String name = row.getAs( "Defendant Name" );
    String gender = row.getAs( "Gender" );
    String race = row.getAs( "Race" );
    String dob = row.getAs( "DOB" );

    StringBuilder sb = new StringBuilder();
            .append( encoder.encodeToString( StringUtils.getBytesUtf8( name ) ) )
            .append( "|" )
            .append( encoder.encodeToString( StringUtils.getBytesUtf8( gender ) ) )
            .append( "|" )
            .append( encoder.encodeToString( StringUtils.getBytesUtf8( race ) ) )
            .append( "|" )
            .append( encoder.encodeToString( StringUtils.getBytesUtf8( dob ) ) );
    return sb.toString();

Example 6

Project: Explainer   File:   Source Code and License 7 votes vote up
public static List> constructListWithColumnNames(DataFrame dataframe,
    String[] columnNames) {

  List l;
  Row[] rows;

  List> list = new ArrayList<>();
  for (String name : columnNames) {
    l = new ArrayList<>();
    rows =;
    for (Row r : rows) {
  return list;


Example 7

Project: bunsen   File:   Source Code and License 6 votes vote up
public void coding() {

  Coding expectedCoding = condition.getSeverity().getCodingFirstRep();
  Coding actualCoding = decodedCondition.getSeverity().getCodingFirstRep();

  // Codings are a nested array, so we explode them into a table of the coding
  // fields so we can easily select and compare individual fields.
  Dataset severityCodings = conditionsDataset
      .select("coding.*") // Pull all fields in the coding to the top level.





Example 8

Project: embulk-input-parquet_hadoop   File:   Source Code and License 6 votes vote up
public List read() throws IOException
    spark.conf().set(SQLConf$.MODULE$.PARQUET_WRITE_LEGACY_FORMAT().key(), isLegacyFormat);

    Dataset dataFrame = spark.createDataFrame(data, schema).repartition(1);
    File file = new File(SparkTestBase.this.tempFolder.getRoot(), name);

    ArrayList results = new ArrayList<>();
    try (ParquetReader reader = ParquetReader
            .builder(new MessagePackReadSupport(), new Path(file.getPath()))
            .build()) {
        Value v;
        while ((v = != null) {
    return results;

Example 9

Project: rdf2x   File:   Source Code and License 6 votes vote up
 * Map a {@link Instance} into an Iterable of all of its relations
 * represented as rows of (related URI, predicate index, type index, instance ID)
 * @param instance the requested {@link Instance}
 * @return an Iterable of all of its relations represented as rows of (related URI, predicate index, type index, instance ID)
private Iterable getRelatedTypeIDs(Instance instance) {
    // typeIDs representing references to the instance in each table (or a single one, if instance has a single type)
    final Long id = instance.getId();

    final List> instanceTypeIDs = getRelationEntityTypes(instance)
            .map(typeIndex -> new Tuple2<>(typeIndex, id))

    return instance.getRelations().stream()
            .flatMap(relation ->
                            .map(instanceTypeID -> RowFactory.create(

Example 10

Project: MegaSparkDiff   File:   Source Code and License 6 votes vote up
private Pair, Dataset> returnDiff(String table1, String table2)
    AppleTable leftAppleTable = SparkFactory.parallelizeJDBCSource("org.hsqldb.jdbc.JDBCDriver",
            "(select * from " + table1 + ")", "table1");

    AppleTable rightAppleTable = SparkFactory.parallelizeJDBCSource("org.hsqldb.jdbc.JDBCDriver",
            "(select * from " + table2 + ")", "table2");

    return SparkCompare.compareAppleTables(leftAppleTable, rightAppleTable);

Example 11

Project: stonk   File:   Source Code and License 6 votes vote up
public static void main(String[] args) throws Exception {
    JavaSparkContext context = buildJavaSparkContext();

    Dataset dataset = SparkDataFileConverter.extractDataFrame(taskInfo, context);
    String mlAlgoName = taskInfo.getSparkTaskAlgorithm().getName();
    MLAlgorithmDesc mlAlgoDesc = MLAlgorithmLoader.getMLAlgorithmDesc(mlAlgoName);

    if (mlAlgoDesc.getComponentsType() == ComponentType.ESTIMATOR) {
        excuteEstimator(taskInfo, dataset);
    } else if (mlAlgoDesc.getComponentsType() == ComponentType.TRANSFORMER) {
        excuteTransformer(taskInfo, dataset);

Example 12

Project: MegaSparkDiff   File:   Source Code and License 6 votes vote up
 * Test of compareRdd method, of class SparkCompare.
public void testCompareRdd() {
    //code to get file1 location
    String file1Path = this.getClass().getClassLoader().
    String file2Path = this.getClass().getClassLoader().

    Pair, Dataset> comparisonResult = SparkCompare.compareFiles(file1Path, file2Path);

    try {
    } catch (Exception e) {"Straightforward output of test results somehow failed");

Example 13

Project: MegaSparkDiff   File:   Source Code and License 6 votes vote up
public void testCompareJDBCtpFileAppleTablesWithDifference()
    AppleTable leftAppleTable = SparkFactory.parallelizeJDBCSource("org.hsqldb.jdbc.JDBCDriver",
            "(select * from Persons1)", "table1");

    String file1Path = this.getClass().getClassLoader().

    AppleTable rightAppleTable = SparkFactory.parallelizeTextSource(file1Path,"table2");

    Pair, Dataset> pair = SparkCompare.compareAppleTables(leftAppleTable, rightAppleTable);

    //the expectation is one difference
    if (pair.getLeft().count() != 2)
    {"expected 2 different record in left");
    if (pair.getRight().count() != 5)
    {"expected 5 different record in right");

Example 14

Project: bunsen   File:   Source Code and License 6 votes vote up
 * Reads a Snomed relationship file and converts it to a {@link HierarchicalElement} dataset.
 * @param spark the Spark session
 * @param snomedRelationshipPath path to the SNOMED relationship file
 * @return a dataset of {@link HierarchicalElement} representing the hierarchical relationship.
public static Dataset readRelationshipFile(SparkSession spark,
    String snomedRelationshipPath) {

      .option("header", true)
      .option("delimiter", "\t")
      .select(col("destinationId"), col("sourceId"))
      .map((MapFunction) row -> {

        HierarchicalElement element = new HierarchicalElement();



        return element;
      }, Hierarchies.getHierarchicalElementEncoder());

Example 15

Project: PRoST   File:   Source Code and License 6 votes vote up
private TableStats calculate_stats_table(Dataset table, String tableName) {
	TableStats.Builder table_stats_builder = TableStats.newBuilder();
	// calculate the stats
	int table_size = (int) table.count();
	int distinct_subjects = (int);
	boolean is_complex = table_size != distinct_subjects;
	// put them in the protobuf object

Example 16

Project: Machine-Learning-End-to-Endguide-for-Java-developers   File:   Source Code and License 6 votes vote up
public static void main(String[] args) {
	SparkSession spark = SparkSession.builder()

	// Load and parse data
	String filePath = "/home/kchoppella/book/Chapter09/data/covtypeNorm.csv";

	// Loads data.
	Dataset inDataset =
			.option("header", "true")
			.option("inferSchema", true)
	ArrayList inputColsList = new ArrayList(Arrays.asList(inDataset.columns()));
	//Make single features column for feature vectors 
	String[] inputCols = inputColsList.parallelStream().toArray(String[]::new);
	//Prepare dataset for training with all features in "features" column
	VectorAssembler assembler = new VectorAssembler().setInputCols(inputCols).setOutputCol("features");
	Dataset dataset = assembler.transform(inDataset);

	PCAModel pca = new PCA()

	Dataset result = pca.transform(dataset).select("pcaFeatures");
	System.out.println("Explained variance:");
	// $example off$

Example 17

Project: uberscriptquery   File:   Source Code and License 6 votes vote up
public Object execute(SparkSession sparkSession, ActionStatement actionStatement, CredentialProvider credentialManager) {

    String filePath = actionStatement.getParamValues().get(0).getValue().toString();
    String saveModeStr = actionStatement.getParamValues().get(1).getValue().toString();
    String dfTableName = actionStatement.getParamValues().get(2).getValue().toString();

    SaveMode saveMode = SaveMode.valueOf(saveModeStr);

    String sql = String.format("select * from %s", dfTableName);"Running sql [%s] to get data and then save it", sql));
    Dataset df = sparkSession.sql(sql);"Saving to csv %s, saveMode: %s", filePath, saveMode));
    df.coalesce(1).write().mode(saveMode).option("header", "false").csv(filePath);"Saved to csv %s, saveMode: %s", filePath, saveMode));
    return null;

Example 18

Project: net.jgp.labs.spark.datasources   File:   Source Code and License 6 votes vote up
public RDD buildScan() {
    log.debug("-> buildScan()");

    // I have isolated the work to a method to keep the plumbing code as simple as
    // possible.
    List table = collectData();

    JavaSparkContext sparkContext = new JavaSparkContext(sqlContext.sparkContext());
    JavaRDD rowRDD = sparkContext.parallelize(table)
            .map(photo -> SparkBeanUtils.getRowFromBean(schema, photo));

    return rowRDD.rdd();

Example 19

Project: integrations   File:   Source Code and License 5 votes vote up
public static void main( String[] args ) throws InterruptedException {

        final String path = args[ 0 ];
        final String username = args[ 1 ];
        final String password = args[ 2 ];
        final SparkSession sparkSession = MissionControl.getSparkSession();
        final String jwtToken = MissionControl.getIdToken( username, password ); "Using the following idToken: Bearer {}", jwtToken );

        Dataset payload = sparkSession
                .format( "com.databricks.spark.csv" )
                .option( "header", "true" )
                .load( path );

        Flight flight = Flight.newFlight()
                .addEntity( ENTITY_SET_TYPE )
                .to( ENTITY_SET_NAME )
                .key( ENTITY_SET_KEY )
                .addProperty( new FullQualifiedName( "iowastate.escene15" ) )
                .value( row -> get_geo( row.getAs( "NUMBER" ),
                        row.getAs( "STREET" ),
                        row.getAs( "UNIT" ),
                        row.getAs( "CITY" ),
                        row.getAs( "POSTCODE" ) ).getFormattedAddress() ).ok()
                .addProperty( new FullQualifiedName( "iowastate.escene11" ) )
                .value( row -> get_geo( row.getAs( "NUMBER" ),
                        row.getAs( "STREET" ),
                        row.getAs( "UNIT" ),
                        row.getAs( "CITY" ),
                        row.getAs( "POSTCODE" ) ) ).ok()

        Shuttle shuttle = new Shuttle( RetrofitFactory.Environment.LOCAL, jwtToken );
        shuttle.launch( flight, payload );

Example 20

Project: MegaSparkDiff   File:   Source Code and License 5 votes vote up
public void testCompareEqualTables()
    Pair,Dataset> pair = returnDiff("Test1","Test2");

    //the expectation is that both tables are equal
    if (pair.getLeft().count() != 0)"Expected 0 differences coming from left table." +
                "  Instead, found " + pair.getLeft().count() + ".");

    if (pair.getRight().count() != 0)"Expected 0 differences coming from right table." +
                "  Instead, found " + pair.getRight().count() + ".");

Example 21

Project: MegaSparkDiff   File:   Source Code and License 5 votes vote up
public void testCompareTable1IsSubset()
    Pair,Dataset> pair = returnDiff("Test4","Test1");

    //the expectation is that table1 is a complete subset of table2
    if (pair.getLeft().count() != 0)"Expected 0 differences coming from left table." +
                "  Instead, found " + pair.getLeft().count() + ".");

    if (pair.getRight().count() != 5)"Expected 5 differences coming from right table." +
                "  Instead, found " + pair.getRight().count() + ".");

Example 22

Project: rdf2x   File:   Source Code and License 5 votes vote up
 * Persist predicate metadata table storing all predicates.
public void writePredicateMetadata() {

    // create the schema
    List fields = new ArrayList<>();
    fields.add(DataTypes.createStructField(PREDICATE_ID, DataTypes.IntegerType, false));
    fields.add(DataTypes.createStructField(PREDICATE_URI, DataTypes.StringType, false));
    fields.add(DataTypes.createStructField(PREDICATE_LABEL, DataTypes.StringType, true));
    StructType schema = DataTypes.createStructType(fields);

    List> indexes = new ArrayList<>();
    indexes.add(new Tuple2<>(PREDICATES_TABLE_NAME, PREDICATE_URI));

    List> primaryKeys = new ArrayList<>();
    primaryKeys.add(new Tuple2<>(PREDICATES_TABLE_NAME, PREDICATE_ID));

    final IndexMap predicateIndex = rdfSchema.getPredicateIndex();
    final Map uriLabels = rdfSchema.getUriLabels();
    // create table rows
    List rows = predicateIndex.getValues().stream()
            .map(uri -> {
                Object[] valueArray = new Object[]{
                return RowFactory.create(valueArray);

    // create and write the META_Predicates dataframe
    DataFrame df = sql.createDataFrame(rows, schema);
    persistor.writeDataFrame(PREDICATES_TABLE_NAME, df);

Example 23

Project: bunsen   File:   Source Code and License 5 votes vote up
public void testSnomedHasAncestor() {

  Dataset results = spark.sql("select id from test_snomed_cond "
      + "where in_valueset(code, 'diabetes')");

  Assert.assertEquals(1, results.count());
  Assert.assertEquals("diabetes", results.head().get(0));

Example 24

Project: integrations   File:   Source Code and License 5 votes vote up
public static String getFirstName( Row row ) {
    String name = row.getAs( "NAME" );
    if ( StringUtils.isBlank( name ) ) {
        return null;
    Matcher m = p.matcher( name );
    if ( !m.matches() ) {
        return null;
    return (String) 2 );

Example 25

Project: integrations   File:   Source Code and License 5 votes vote up
public static String getLastName( Row row ) {
    String name = row.getAs( "NAME" );
    if ( StringUtils.isBlank( name ) ) {
        return null;
    Matcher m = p.matcher( name );
    if ( !m.matches() ) {
        return null;
    return (String) 1 );

Example 26

Project: integrations   File:   Source Code and License 5 votes vote up
public static String getFirstName( Row row ) {
    String name = row.getAs( "NAME" );
    if ( StringUtils.isBlank( name ) ) {
        return null;
    Matcher m = p.matcher( name );
    if ( !m.matches() ) {
        return null;
    return (String) 2 );

Example 27

Project: integrations   File:   Source Code and License 5 votes vote up
public static String getLastName( Row row ) {
    String name = row.getAs( "NAME" );
    if ( StringUtils.isBlank( name ) ) {
        return null;
    Matcher m = p.matcher( name );
    if ( !m.matches() ) {
        return null;
    return (String) 1 );

Example 28

Project: bunsen   File:   Source Code and License 5 votes vote up
 * Writes value records to a table. This class ensures the columns and partitions are mapped
 * properly, and is a workaround similar to the problem described here.
 * @param values a dataset of value records
 * @param tableName the table to write them to
private static void writeValuesToTable(Dataset values, String tableName) {

  // Note the last two columns here must be the partitioned-by columns in order and in lower case
  // for Spark to properly match them to the partitions
  Dataset orderColumnDataset ="system",


Example 29

Project: rdf2x   File:   Source Code and License 5 votes vote up
public void testWriteRelationTablesWithoutPredicateIndex() throws IOException {
    InstanceRelationWriter writer = new InstanceRelationWriter(config
            .setStorePredicate(false), jsc(), persistor, rdfSchema);
    writer.writeRelationTables(getTestRelationSchema(), getTestRelations());

    List rows = new ArrayList<>();
    rows.add(RowFactory.create(1L, 3L));
    rows.add(RowFactory.create(2L, 3L));

    DataFrame result = this.result.values().iterator().next();
    assertEquals("Expected schema of A_B was extracted", getExpectedSchemaOfAB(false, false), result.schema());
    assertRDDEquals("Expected rows of A_B were extracted", jsc().parallelize(rows), result.toJavaRDD());

Example 30

Project: HiveUnit   File:   Source Code and License 5 votes vote up
static Tabular tabularDataset(Dataset ds){
    return new Tabular(){
        public int          numRows()                   { return (int)ds.count(); }
        public int          numCols()                   { return ds.columns().length; }
        public List headers()                   { return Arrays.asList(ds.columns()) ; }
        public String val(int rowNum, int colNum) {
            int ri = rowNum-1;
            int ci = colNum-1;
            Object v = ds.collectAsList().get(ri).get(ci);
            return v == null ? "" : v.toString(); }

Example 31

Project: spark-cassandra-poc   File:   Source Code and License 5 votes vote up
private void writeUserViewCountResultToCassandra(List collectAsList, String tableName,
		Connection connection) throws QueryExecutionException {
	connection.execute(new CassandraQuery("DROP table if exists wootag." + tableName + ";"));
	connection.execute(new CassandraQuery("create table IF NOT EXISTS wootag." + tableName + " ("
			+ " user_id text, view_duration_in_second int, view_counts int,"
			+ " PRIMARY KEY ( user_id, view_duration_in_second )" + ");"));

	connection.insertRows(collectAsList, tableName,
			Arrays.asList("user_id", "view_duration_in_second", "view_counts"));
	System.out.println("Output size : " + collectAsList.size());

Example 32

Project: rdf2x   File:   Source Code and License 5 votes vote up
 * Write metadata describing relation tables
 * @param relationSchema the relation schema
public void writeRelationMetadata(RelationSchema relationSchema) {
    // create the schema
    List fields = new ArrayList<>();
    fields.add(DataTypes.createStructField(RELATIONS_NAME, DataTypes.StringType, false));
    fields.add(DataTypes.createStructField(RELATIONS_FROM_NAME, DataTypes.StringType, true));
    fields.add(DataTypes.createStructField(RELATIONS_TO_NAME, DataTypes.StringType, true));
    fields.add(DataTypes.createStructField(RELATIONS_PREDICATE_ID, DataTypes.IntegerType, true));

    // create table rows
    List rows = relationSchema.getTables().stream()
            .map(table -> {
                RelationPredicateFilter predicateFilter = table.getPredicateFilter();
                RelationEntityFilter entityFilter = table.getEntityFilter();
                Object[] valueArray = new Object[]{
                        entityFilter == null ? null : entityFilter.getFromTypeName(),
                        entityFilter == null ? null : entityFilter.getToTypeName(),
                        predicateFilter == null ? null : rdfSchema.getPredicateIndex().getIndex(predicateFilter.getPredicateURI())
                return RowFactory.create(valueArray);

    StructType schema = DataTypes.createStructType(fields);

    // add index for each field
    List> indexes =
            .map(field -> new Tuple2<>(RELATIONS_TABLE_NAME,

    // create and write the META_Relations dataframe
    DataFrame df = sql.createDataFrame(rows, schema);
    persistor.writeDataFrame(RELATIONS_TABLE_NAME, df);

Example 33

Project: integrations   File:   Source Code and License 5 votes vote up
public static String safeDOBParse( Row row ) {
    String dob = row.getAs( "birthd" );
    if ( dob == null ) {
        return null;
    if ( dob.contains( "#" ) ) {
        return null;
    return bdHelper.parse( dob );

Example 34

Project: uberscriptquery   File:   Source Code and License 5 votes vote up
public Dataset execute(SparkSession sparkSession, StatementAssignment statementAssignment, CredentialProvider credentialManager) {"Running query by sql jdbc: " + statementAssignment);
    Map queryConfig = statementAssignment.getQueryConfig();
    String connectionString = queryConfig.get(StatementAssignment.QUERY_CONFIG_CONNECTION_STRING);
    String passwordFile = queryConfig.get(StatementAssignment.QUERY_CONFIG_PASSWORD_FILE);
    String passwordEntry = queryConfig.get(StatementAssignment.QUERY_CONFIG_PASSWORD_ENTRY);
    String password = credentialManager.getPassword(passwordFile, passwordEntry);
    if (password != null) {
        connectionString = connectionString.replace("[password]", password);
    return SparkUtils.readJdbc(connectionString, statementAssignment.getQueryStatement(), sparkSession);

Example 35

Project: rdf2x   File:   Source Code and License 5 votes vote up
private JavaRDD getExpectedRowsOfMetaPredicates() {
    List rows = new ArrayList<>();
    rows.add(RowFactory.create(predicateIndex.getIndex(""), "", "Knows label"));
    rows.add(RowFactory.create(predicateIndex.getIndex(""), "", "Likes label"));
    rows.add(RowFactory.create(predicateIndex.getIndex(""), "", "Name label"));
    rows.add(RowFactory.create(predicateIndex.getIndex(""), "", null));
    return jsc().parallelize(rows);

Example 36

Project: net.jgp.labs.spark.datasources   File:   Source Code and License 5 votes vote up
private boolean start() {
    SparkSession spark = SparkSession.builder()
            .appName("EXIF to Dataset")
    String importDirectory = "/Users/jgp/Pictures";
    Dataset df =
            .option("recursive", "true")
            .option("limit", "100000")
            .option("extensions", "jpg,jpeg")
    // We can start analytics
    df = df
    System.out.println("I have imported " + df.count() + " photos.");
    return true;

Example 37

Project: rdf2x   File:   Source Code and License 5 votes vote up
private DataFrame getTestRDD() {
    SQLContext sql = new SQLContext(jsc());
    List rdd = new ArrayList<>();

    // cycle one -> two -> three -> one
    rdd.add(RowFactory.create(0, uriIndex.getIndex(""), 1L, uriIndex.getIndex(""), 2L));
    rdd.add(RowFactory.create(0, uriIndex.getIndex(""), 2L, uriIndex.getIndex(""), 3L));
    rdd.add(RowFactory.create(0, uriIndex.getIndex(""), 3L, uriIndex.getIndex(""), 1L));

    // one -> four, four -> one
    rdd.add(RowFactory.create(0, uriIndex.getIndex(""), 1L, uriIndex.getIndex(""), 4L));
    rdd.add(RowFactory.create(0, uriIndex.getIndex(""), 4L, uriIndex.getIndex(""), 1L));

    // five -> one
    rdd.add(RowFactory.create(0, uriIndex.getIndex(""), 5L, uriIndex.getIndex(""), 1L));

    return sql.createDataFrame(rdd, new StructType()
            .add("predicateIndex", DataTypes.IntegerType, false)
            .add("fromTypeIndex", DataTypes.IntegerType, false)
            .add("fromID", DataTypes.LongType, false)
            .add("toTypeIndex", DataTypes.IntegerType, false)
            .add("toID", DataTypes.LongType, false)

Example 38

Project: rdf2x   File:   Source Code and License 5 votes vote up
private JavaRDD getExpectedRowsOfEAV() {
    List rows = new ArrayList<>();
    rows.add(RowFactory.create(1L, uriIndex.getIndex(""), "STRING", null, "First A 1"));
    rows.add(RowFactory.create(1L, uriIndex.getIndex(""), "STRING", null, "First A 2"));
    rows.add(RowFactory.create(2L, uriIndex.getIndex(""), "STRING", null, "Second A"));
    rows.add(RowFactory.create(3L, uriIndex.getIndex(""), "INTEGER", null, "100"));
    rows.add(RowFactory.create(3L, uriIndex.getIndex(""), "STRING", "en", "First B"));
    return jsc().parallelize(rows);

Example 39

Project: bunsen   File:   Source Code and License 5 votes vote up
 * Writes mapping records to a table. This class ensures the columns and partitions are mapped
 * properly, and is a workaround similar to the problem described here.
 * @param mappings a dataset of mapping records
 * @param tableName the table to write them to
private static void writeMappingsToTable(Dataset mappings,
    String tableName) {

  // Note the last two columns here must be the partitioned-by columns
  // in order and in lower case for Spark to properly match
  // them to the partitions.
  Dataset orderedColumnDataset ="sourceValueSet",


Example 40

Project: MegaSparkDiff   File:   Source Code and License 5 votes  
public void testCompareJDBCTableToTextFile()

    AppleTable leftAppleTable = SparkFactory.parallelizeJDBCSource("org.hsqldb.jdbc.JDBCDriver",
            "(select * from Test4)", "table1");

    String file2Path = this.getClass().getClassLoader().
    AppleTable rightAppleTable = SparkFactory.parallelizeTextSource(file2Path,"table2");

    Pair,Dataset> pair = SparkCompare.compareAppleTables(leftAppleTable, rightAppleTable);

    //the expectation is that both tables are completely different
    if (pair.getLeft().count() != 0)"Expected 0 differences coming from left table." +
                "  Instead, found " + pair.getLeft().count() + ".");

    if (pair.getRight().count() != 1)"Expected 1 difference coming from right table." +
                "  Instead, found " + pair.getRight().count() + ".");



