I'm writing data to S3 with Apache Hudi and syncing the tables to Hive via the AWS Glue Data Catalog.
Hudi options:

hudi_options = {
    'hoodie.table.name': 'mytable',
    'hoodie.datasource.write.recordkey.field': 'Id',
    'hoodie.datasource.write.partitionpath.field': 'date',
    'hoodie.datasource.write.table.name': 'mytable',
    'hoodie.datasource.write.operation': 'upsert',
    'hoodie.datasource.write.precombine.field': 'LastModifiedDate',
    'hoodie.datasource.hive_sync.enable': 'true',
    'hoodie.datasource.hive_sync.partition_fields': 'date',
    'hoodie.datasource.hive_sync.database': 'hudi_lake_dev',
    'hoodie.datasource.hive_sync.table': 'mytable',
}
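The write itself is the standard Hudi datasource save; a minimal sketch of how these options are passed (the source and target paths are placeholders, not my real locations):

from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("my_pipeline").getOrCreate()
df = spark.read.json("s3://my-bucket/raw/mytable/")  # placeholder source

# Upsert into the Hudi table; the Hive/Glue sync runs at the end of save(),
# which is where the traceback below originates.
(
    df.write.format("hudi")
    .options(**hudi_options)
    .mode("append")
    .save("s3://my-bucket/hudi_lake_dev/mytable/")  # placeholder target path
)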
EMR Configurations:

...
{
    "Classification": "yarn-site",
    "Properties": {
        "yarn.nodemanager.vmem-check-enabled": "false",
        "yarn.log-aggregation-enable": "true",
        "yarn.log-aggregation.retain-seconds": "-1",
        "yarn.nodemanager.remote-app-log-dir": config[
            "yarn_agg_log_uri_s3_path"
        ].format(current_date),
    },
    "Configurations": [],
},
{
    "Classification": "spark-hive-site",
    "Properties": {
        "hive.metastore.client.factory.class": "com.amazonaws.glue.catalog.metastore.AWSGlueDataCatalogHiveClientFactory"  # noqa
    },
},
{
    "Classification": "presto-connector-hive",
    "Properties": {"hive.metastore.glue.datacatalog.enabled": "true"},
},
{
    "Classification": "hive-site",
    "Properties": {
        "hive.metastore.client.factory.class": "com.amazonaws.glue.catalog.metastore.AWSGlueDataCatalogHiveClientFactory",  # noqa
        "hive.metastore.schema.verification": "false",
    },
},
...
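For completeness, this is roughly how those classifications are applied at cluster launch; everything other than the Configurations list (cluster name, release label, instance sizing, roles) is a placeholder rather than my exact setup:

import boto3

emr = boto3.client("emr", region_name="us-east-1")  # region is a placeholder

# The classification dicts shown above are collected into a list and handed
# to run_job_flow via the Configurations parameter at cluster creation.
emr_configurations = [
    # ... the yarn-site, spark-hive-site, presto-connector-hive and
    # hive-site classifications from above ...
]

response = emr.run_job_flow(
    Name="hudi-pipeline",        # placeholder name
    ReleaseLabel="emr-6.4.0",    # placeholder release label
    Applications=[{"Name": "Spark"}, {"Name": "Hive"}],
    Configurations=emr_configurations,
    Instances={
        "MasterInstanceType": "m5.xlarge",  # placeholder sizing
        "SlaveInstanceType": "m5.xlarge",
        "InstanceCount": 3,
        "KeepJobFlowAliveWhenNoSteps": False,
    },
    JobFlowRole="EMR_EC2_DefaultRole",
    ServiceRole="EMR_DefaultRole",
)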
Getting the following error:
Traceback (most recent call last):
File "/home/hadoop/my_pipeline.py", line 31, in <module>
...
File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/sql/readwriter.py", line 1109, in save
File "/usr/lib/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py", line 1305, in __call__
File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/sql/utils.py", line 111, in deco
File "/usr/lib/spark/python/lib/py4j-0.10.9-src.zip/py4j/protocol.py", line 328, in get_return_value
py4j.protocol.Py4JJavaError: An error occurred while calling o84.save.
: org.apache.hudi.hive.HoodieHiveSyncException: Got runtime exception when hive syncing
at org.apache.hudi.hive.HiveSyncTool.<init>(HiveSyncTool.java:74)
at org.apache.hudi.HoodieSparkSqlWriter$.syncHive(HoodieSparkSqlWriter.scala:391)
at org.apache.hudi.HoodieSparkSqlWriter$.$anonfun$metaSync$4(HoodieSparkSqlWriter.scala:440)
at org.apache.hudi.HoodieSparkSqlWriter$.$anonfun$metaSync$4$adapted(HoodieSparkSqlWriter.scala:436)
at scala.collection.mutable.HashSet.foreach(HashSet.scala:79)
at org.apache.hudi.HoodieSparkSqlWriter$.metaSync(HoodieSparkSqlWriter.scala:436)
at org.apache.hudi.HoodieSparkSqlWriter$.commitAndPerformPostOperations(HoodieSparkSqlWriter.scala:497)
at org.apache.hudi.HoodieSparkSqlWriter$.write(HoodieSparkSqlWriter.scala:223)
at org.apache.hudi.DefaultSource.createRelation(DefaultSource.scala:145)
at org.apache.spark.sql.execution.datasources.SaveIntoDataSourceCommand.run(SaveIntoDataSourceCommand.scala:46)
at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult$lzycompute(commands.scala:70)
at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult(commands.scala:68)
at org.apache.spark.sql.execution.command.ExecutedCommandExec.doExecute(commands.scala:90)
at org.apache.spark.sql.execution.SparkPlan.$anonfun$execute$1(SparkPlan.scala:194)
at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:232)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:229)
at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:190)
at org.apache.spark.sql.execution.QueryExecution.toRdd$lzycompute(QueryExecution.scala:134)
at org.apache.spark.sql.execution.QueryExecution.toRdd(QueryExecution.scala:133)
at org.apache.spark.sql.DataFrameWriter.$anonfun$runCommand$1(DataFrameWriter.scala:989)
at org.apache.spark.sql.catalyst.QueryPlanningTracker$.withTracker(QueryPlanningTracker.scala:107)
at org.apache.spark.sql.execution.SQLExecution$.withTracker(SQLExecution.scala:232)
at org.apache.spark.sql.execution.SQLExecution$.executeQuery$1(SQLExecution.scala:110)
at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$6(SQLExecution.scala:135)
at org.apache.spark.sql.catalyst.QueryPlanningTracker$.withTracker(QueryPlanningTracker.scala:107)
at org.apache.spark.sql.execution.SQLExecution$.withTracker(SQLExecution.scala:232)
at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$5(SQLExecution.scala:135)
at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:253)
at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:134)
at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:775)
at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:68)
at org.apache.spark.sql.DataFrameWriter.runCommand(DataFrameWriter.scala:989)
at org.apache.spark.sql.DataFrameWriter.saveToV1Source(DataFrameWriter.scala:438)
at org.apache.spark.sql.DataFrameWriter.saveInternal(DataFrameWriter.scala:415)
at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:293)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:282)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:238)
at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.hudi.hive.HoodieHiveSyncException: Failed to create HiveMetaStoreClient
at org.apache.hudi.hive.HoodieHiveClient.<init>(HoodieHiveClient.java:93)
at org.apache.hudi.hive.HiveSyncTool.<init>(HiveSyncTool.java:69)
... 46 more
Caused by: org.apache.hadoop.hive.ql.metadata.HiveException: org.apache.hadoop.hive.ql.metadata.HiveException: MetaException(message:Unable to instantiate a metastore client factory com.amazonaws.glue.catalog.metastore.AWSGlueDataCatalogHiveClientFactory due to: java.lang.ClassNotFoundException: Class com.amazonaws.glue.catalog.metastore.AWSGlueDataCatalogHiveClientFactory not found)
at org.apache.hadoop.hive.ql.metadata.Hive.registerAllFunctionsOnce(Hive.java:239)
at org.apache.hadoop.hive.ql.metadata.Hive.<init>(Hive.java:402)
at org.apache.hadoop.hive.ql.metadata.Hive.create(Hive.java:335)
at org.apache.hadoop.hive.ql.metadata.Hive.getInternal(Hive.java:315)
at org.apache.hadoop.hive.ql.metadata.Hive.get(Hive.java:291)
at org.apache.hudi.hive.HoodieHiveClient.<init>(HoodieHiveClient.java:91)
... 47 more
Caused by: org.apache.hadoop.hive.ql.metadata.HiveException: MetaException(message:Unable to instantiate a metastore client factory com.amazonaws.glue.catalog.metastore.AWSGlueDataCatalogHiveClientFactory due to: java.lang.ClassNotFoundException: Class com.amazonaws.glue.catalog.metastore.AWSGlueDataCatalogHiveClientFactory not found)
at org.apache.hadoop.hive.ql.metadata.Hive.getAllFunctions(Hive.java:3991)
at org.apache.hadoop.hive.ql.metadata.Hive.reloadFunctions(Hive.java:251)
at org.apache.hadoop.hive.ql.metadata.Hive.registerAllFunctionsOnce(Hive.java:234)
... 52 more
Caused by: MetaException(message:Unable to instantiate a metastore client factory com.amazonaws.glue.catalog.metastore.AWSGlueDataCatalogHiveClientFactory due to: java.lang.ClassNotFoundException: Class com.amazonaws.glue.catalog.metastore.AWSGlueDataCatalogHiveClientFactory not found)
at org.apache.hadoop.hive.ql.metadata.HiveUtils.createMetaStoreClientFactory(HiveUtils.java:525)
at org.apache.hadoop.hive.ql.metadata.HiveUtils.createMetaStoreClient(HiveUtils.java:506)
at org.apache.hadoop.hive.ql.metadata.Hive.getMSC(Hive.java:3746)
at org.apache.hadoop.hive.ql.metadata.Hive.getMSC(Hive.java:3726)
at org.apache.hadoop.hive.ql.metadata.Hive.getAllFunctions(Hive.java:3988)
... 54 more
22/01/12 16:37:53 INFO SparkContext: Invoking stop() from shutdown hook
Is there an option to apply these changes? I've added the following after the GLUE_JARS line:
HIVE_METASTORE=/usr/share/aws/aws-java-sdk/aws-java-sdk-bundle-1.12.31.jar:/usr/share/aws/hmclient/lib/aws-glue-datacatalog-hive3-client-3.4.0.jar:/usr/share/aws/hmclient/lib/aws-glue-datacatalog-hive3-client.jar:$HIVE_METASTORE
But I'm running into the same error.
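In case it helps, here is the same attempt expressed as an EMR classification instead of an environment edit; is spark-defaults the right place to put these jars? (The jar paths are copied from the export above; I'm aware this would likely need to be appended to EMR's default extraClassPath rather than replace it.)

# Sketch of the alternative I'm considering: appending the Glue Data Catalog
# client jars to the Spark classpath through a spark-defaults classification.
# Jar paths are the ones from the export above; whether this is the right
# mechanism for the Hudi hive sync is exactly what I'm asking.
glue_client_jars = ":".join([
    "/usr/share/aws/aws-java-sdk/aws-java-sdk-bundle-1.12.31.jar",
    "/usr/share/aws/hmclient/lib/aws-glue-datacatalog-hive3-client.jar",
])

spark_defaults = {
    "Classification": "spark-defaults",
    "Properties": {
        # NOTE: EMR ships its own default extraClassPath; these values would
        # have to be appended to it, not substituted for it.
        "spark.driver.extraClassPath": glue_client_jars,
        "spark.executor.extraClassPath": glue_client_jars,
    },
}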