Migrate Data from MySQL to DynamoDB
Writing directly to DynamoDB
https://github.com/audienceproject/spark-dynamodb
I thought this setup should work, but it fails at reading.
%spark.dep
z.load("mysql:mysql-connector-java:5.1.47")
z.load("com.github.traviscrawford:spark-dynamodb:0.0.13")
z.load("com.audienceproject:spark-dynamodb_2.11:0.4.1")
This read does not work:
import com.audienceproject.spark.dynamodb.implicits._
val accountDF = spark.read.option("region","us-west-1").dynamodb("account-int-accounts")
accountDF.printSchema()
accountDF.show(2)
This read works:
import com.github.traviscrawford.spark.dynamodb._
val accountDF = sqlContext.read.dynamodb("us-west-1", "account-int-accounts")
accountDF.printSchema()
accountDF.show(1)
This works for writing data, but I do not think it handles the table's write capacity well.
%spark.dep
z.load("mysql:mysql-connector-java:5.1.47")
z.load("com.github.traviscrawford:spark-dynamodb:0.0.13")
z.load("com.audienceproject:spark-dynamodb_2.11:0.4.1")
z.load("com.google.guava:guava:14.0.1")
import com.github.traviscrawford.spark.dynamodb._
val accountDF = sqlContext.read.dynamodb("us-west-1", "account-int-accounts")
accountDF.printSchema()
accountDF.show(1)
import com.audienceproject.spark.dynamodb.implicits._
accountDF.write.option("region", "us-west-1").dynamodb("account-int-accounts2")
After adding the guava dependency, the read works as well:
import com.audienceproject.spark.dynamodb.implicits._
val dynamoDF = spark.read.option("region", "us-west-1").dynamodb("account-int-accounts")
dynamoDF.printSchema()
dynamoDF.show(5)
DynamoDB Format and AWS Command
https://github.com/lmammino/json-dynamo-putrequest
First, prepare the JSON file on the server; usually I download it from HDFS:
> hdfs dfs -get hdfs://localhost:9000/mysqltodynamodb/account2 ./account2
Find the JSON file account2.json
Install NodeJS if it is not already on the system:
> sudo apt install nodejs
> sudo apt install npm
> node --version && npm --version
v8.10.0
3.5.2
Install the software
> sudo npm install --global json-dynamo-putrequest
Check installation
> json-dynamo-putrequest --help
> json-dynamo-putrequest --version
1.0.0
Command
> json-dynamo-putrequest account-int-accounts2 --output account-dynamo.json < account2.json
Error: Input data needs to be an array
The exported file has one JSON object per line, so wrap the whole file in [ and ] and replace each record-ending } with }, to turn the records into an array.
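A minimal sketch of that conversion (the helper file name is mine; it assumes the one-object-per-line account2.json produced above and converts it in place):
"use strict";
// to-array.js: convert Spark's JSON-lines output into the JSON array
// that json-dynamo-putrequest expects. Overwrites account2.json in place.
const fs = require('fs');
const lines = fs.readFileSync('account2.json', 'utf8')
    .split('\n')
    .filter((line) => line.trim().length > 0);
const records = lines.map((line) => JSON.parse(line));
fs.writeFileSync('account2.json', JSON.stringify(records, null, 2));
With the file converted to an array, run the command again: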
> json-dynamo-putrequest account-int-accounts2 --output account-dynamo.json < account2.json
Output saved in /home/ubuntu/data/account-dynamo.json
File is ready as account-dynamo.json
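The generated file follows the request-items shape that batch-write-item expects: the table name mapped to an array of PutRequest entries, with attributes in DynamoDB attribute-value notation. Roughly (the attribute name here is illustrative):
{
  "account-int-accounts2": [
    { "PutRequest": { "Item": { "accountUUID": { "S": "..." } } } },
    { "PutRequest": { "Item": { "accountUUID": { "S": "..." } } } }
  ]
}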
Then follow the documentation to import the data into the table:
https://docs.aws.amazon.com/amazondynamodb/latest/developerguide/SampleData.LoadData.html
Create the table from the console, then execute the import command:
> aws dynamodb batch-write-item --request-items file:///home/ubuntu/data/account-dynamo.json
at 'requestItems' failed to satisfy constraint: Map value must satisfy constraint: [Member must have length less than or equal to 25, Member must have length greater than or equal to 1]
Hah: in the docs all the samples have fewer than 25 items, because batch-write-item accepts at most 25 items per request.
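One workaround is to split the request file into chunks of 25 and run the import once per chunk. A sketch, assuming account-dynamo.json has the shape shown above (the helper and its output naming are mine):
"use strict";
// split-batches.js: split a batch-write-item request file into
// chunks of 25 items, the per-request maximum.
const fs = require('fs');
const tableName = 'account-int-accounts2';
const requests = JSON.parse(fs.readFileSync('account-dynamo.json', 'utf8'))[tableName];
for (let i = 0; i < requests.length; i += 25) {
    const chunk = { [tableName]: requests.slice(i, i + 25) };
    fs.writeFileSync(`account-dynamo-${i / 25}.json`, JSON.stringify(chunk));
}
Each account-dynamo-N.json can then be passed to aws dynamodb batch-write-item as above.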
What worked in the end was writing NodeJS directly to parse the source file and do the import:
"use strict";
// How to run
// node dynamodb-scripts/import-devices-to-dynamodb.js {ENV} ./css_devices_only_csv.txt
// eg: node dynamodb-scripts/import-devices-to-dynamodb.js int ./css_devices_only_csv.txt
var importDevicesToDynamo;
process.env.AWS_SDK_LOAD_CONFIG = true;
(function (importDevicesToDynamo) {
const fs = require('fs');
const babyparse = require("babyparse");
const AWS = require("aws-sdk");
const log4js = require('log4js');
const logger = log4js.getLogger();
const sleep = require('sleep');
const env = process.argv[2]; // Must be int, stage or prod
const csvFilePath = process.argv[3];
const config = {
delimiter: ',',
newline: "",
quoteChar: '"',
header: true,
dynamicTyping: false,
preview: 0,
encoding: "utf8",
worker: false,
comments: false,
skipEmptyLines: true
};
let tableName = `lifesize_device-${env}-devicePairingInfo`;
// Placeholders only; credentials and region actually come from the shared AWS config (AWS_SDK_LOAD_CONFIG above).
let accessKey = "";
let signatureKey = "";
let region = "";
let dynamoDbUrl = "";
//validate parameters
if (!env) {
console.log("\nMust pass in environment for 1st argument. Must be one of 'int, 'stage' or 'prod'");
console.log("\nUsage - node dynamodb-scripts/import-devices-to-dynamodb.js {env} {csv path/file } ");
console.log("\nExample - node dynamodb-scripts/import-devices-to-dynamodb.js int ./css_devices_only_csv.txt \n");
process.exit(1);
}
if (!csvFilePath) {
console.log("\nMust pass in csvFilePath for 2nd argument.");
console.log("\nUsage - node dynamodb-scripts/import-devices-to-dynamodb.js {env} {csv path/file } ");
console.log("\nExample - node dynamodb-scripts/import-devices-to-dynamodb.js int ./css_devices_only_csv.txt \n");
process.exit(2);
}
console.log("Env = " + env);
console.log("File to import = " + csvFilePath);
let content = fs.readFileSync(csvFilePath, config);
let parsed = babyparse.parse(content, config);
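// A JSON round trip gives a plain deep copy of the parsed rows.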
let rows = JSON.parse(JSON.stringify(parsed.data));
console.log("Row count = " + Object.keys(rows).length);
let _id;
// For the batch size of 10, we need to temporarily change the write capacity units to 50 in DynamoDB for the appropriate table, then reset to default when the script is finished
let size = 10;
console.log("dynamoDbURL = " + dynamoDbUrl);
console.log("tableName = " + tableName);
var credentials = new AWS.SharedIniFileCredentials();
AWS.config.credentials = credentials;
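// DocumentClient marshals native JavaScript types to DynamoDB attribute
// values, so items can be written as plain JSON objects.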
const dynamoDb = new AWS.DynamoDB.DocumentClient();
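// Serial numbers seen so far, used to skip duplicate rows.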
let uniqueSerialNumbers = [];
for (let i = 0; i < rows.length; i += size) {
// Slice the rows into batches of `size` items
let smallarray = rows.slice(i, i + size);
console.log("i = " + i + " serialNumber = " + smallarray[0].serialNumber);
let batchItems = smallarray.map(function (item) {
try {
const serialNumber = item.serialNumber;
if (uniqueSerialNumbers.includes(serialNumber)) {
//console.log("System ignore duplicated record", item);
return null;
} else {
uniqueSerialNumbers.push(serialNumber);
}
// Replace empty string values with null. DynamoDB doesn't allow empty strings, will throw error on request.
for (let items in item) {
let value = item[items];
if (value === undefined || value === "") {
item[items] = null;
}
if (items == "enabled") {
if (value === "f") {
item[items] = false;
} else if (value === "t") {
item[items] = true;
}
}
}
item.adminAccountUUID = null;
item.sessionID = null;
item.pairingCodeCreateTime = null;
if(item.systemName === null){
item.systemName = item.userExtension.toString()
}
if(item.pairingstatus === 'DEFAULT'){
item.pairingstatus = "COMPLETE"
}
if(item.plaform === 'GRAPHITE'){
item.deviceUUID = item.serialNumber
}
if(item.userExtension && !item.extension) {
item.extension = item.userExtension.toString();
console.log(`++++++++++++++++++++++++++++++++++++`);
}
let params = {
PutRequest: { Item: JSON.parse(JSON.stringify(item)) }
};
console.log("params = " + JSON.stringify(params, null, 2));
return params;
}
catch (error) {
console.log("**** ERROR processing file: " + error);
}
}).filter((obj) =>
obj !== null
);
if (batchItems.length === 0) {
console.log("System filter all the dupicate data, nothing left");
continue;
}
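// Build the batchWrite request body: RequestItems maps the table name to this batch's PutRequests.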
let batchRequestParams = '{"RequestItems":{"' + tableName + '":' + JSON.stringify(batchItems) + '},"ReturnConsumedCapacity":"TOTAL","ReturnItemCollectionMetrics": "SIZE"}';
console.log("batchRequestParams ============================================================ ");// + batchRequestParams);
callDynamo(batchRequestParams).then(function (data) {
sleep.msleep(100);
}).catch(console.error);
}
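// Write one batch, retrying once on error before giving up.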
function callDynamo(batchRequestParams) {
return new Promise(function (resolve, reject) {
dynamoDb.batchWrite(JSON.parse(batchRequestParams), function (err, data) {
try {
if (err) {
logger.error(`Error - ${err} = Trying again:`);
sleep.msleep(100);
dynamoDb.batchWrite(JSON.parse(batchRequestParams), function (err, data) {
try {
if (err) {
//console.log("------------- data is beauty:", batchRequestParams);
logger.error("Unable to add item a 2nd time, Error:", err);
return reject(err);
}
else {
logger.debug("2nd PutItem succeeded");
resolve(data);
}
}
catch (error) {
//console.log("------------- data is here:", batchRequestParams);
console.log("error calling DynamoDB - " + error);
return reject(err);
}
});
}
else {
logger.debug("PutItem succeeded");
resolve(data);
}
}
catch (error) {
console.log("error calling DynamoDB - " + error);
return reject(err);
}
});
});
}
})(importDevicesToDynamo || (importDevicesToDynamo = {}));
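As the comment in the script notes, if the table uses provisioned capacity, the write capacity units can be raised before the run and reset to the default afterwards. With the AWS CLI that would look something like this (numbers illustrative):
> aws dynamodb update-table --table-name lifesize_device-int-devicePairingInfo --provisioned-throughput ReadCapacityUnits=5,WriteCapacityUnits=50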
References:
https://github.com/audienceproject/spark-dynamodb
https://stackoverflow.com/questions/37444607/writing-from-spark-to-dynamodb
https://docs.aws.amazon.com/amazondynamodb/latest/developerguide/SampleData.LoadData.html
https://github.com/lmammino/json-dynamo-putrequest