Issue
relationalize
is coercing timestamp columns of nested child tables to string.
Note I've tested this using Glue 4.0 locally with the AWS provided amazon/aws-glue-libs:glue_libs_4.0.0_image_01
Docker image, however it is derived from a live Glue issue we have. Our real example is much more complicated with more levels of nesting, and as such re-casting to timestamp is a possible workaround but really not ideal.
Minimal Reproduction Script
from pyspark.context import SparkContext
from pyspark import SparkConf
from awsglue.context import GlueContext
from awsglue.dynamicframe import DynamicFrame
from pyspark.sql.types import StructType, StructField, StringType, TimestampType, ArrayType
from datetime import datetime
sc = SparkContext.getOrCreate(SparkConf())
# df = spark.createDataFrame(data=data, schema=schema)
glueContext = GlueContext(sc)
spark = glueContext.spark_session
schema: StructType = StructType(
[
StructField("parent_id", StringType(), True),
StructField("timestamp_col", TimestampType(), True),
StructField(
"child",
ArrayType(
StructType(
[
StructField("child_id", StringType(), True),
StructField("child_field", StringType(), True),
StructField("child_timestamp", TimestampType(), True),
]
),
True,
),
True,
),
]
)
input_data = {
"parent_id": "test_id",
"updatedAt": datetime(2022, 4, 1, 20, 50, 2, 515000),
"child": [
{
"_id": "test_child_id",
"child_field": "test_child_field",
"child_timestamp": datetime(2022, 4, 6, 19, 0, 0),
},
],
}
data = [tuple(input_data.values())]
df = spark.createDataFrame(data=data, schema=schema)
df.printSchema()
"""
root
|-- parent_id: string (nullable = true)
|-- timestamp_col: timestamp (nullable = true)
|-- child: array (nullable = true)
| |-- element: struct (containsNull = true)
| | |-- child_id: string (nullable = true)
| | |-- child_field: string (nullable = true)
| | |-- child_timestamp: timestamp (nullable = true) <- Correct type
"""
dynamic_frame = DynamicFrame.fromDF(df, glueContext, "test")
dfc = dynamic_frame.relationalize("root", "./tmp/")
root = dfc.select("root").toDF()
root.printSchema()
"""
root
|-- parent_id: string (nullable = true)
|-- timestamp_col: timestamp (nullable = true) <- Correct type
|-- child: long (nullable = true)
"""
viewers = dfc.select("root_child").toDF()
viewers.printSchema()
"""
root
|-- id: long (nullable = true)
|-- index: integer (nullable = true)
|-- child.val.child_field: string (nullable = true)
|-- child.val.child_timestamp: string (nullable = true) <- Only incorrect type
"""
As called out in the comments with the printed output, child.val.child_timestamp
has been coerced to a string. Its value is just the string representation of the Timestamp.
Question
As a user it looks like a bug as the relationalize
documentation makes no mention of modifying datatypes. Is there a way to control this behavior or prevent it?