Using relationalize to flatten and then join everything back sounds like overkill (it also assumes the joins know which columns to use).
I would just flatten using the DataFrame schema, which is what the Flatten visual transform does.
e.g.:
from pyspark.sql.types import StructType

def flatten(df, maxLevels=0, separator="."):
    cols_path = []
    # Assumed helper: wrap a field name in backticks so Spark treats dots inside names literally
    escape_name = lambda name: f"`{name}`"
    # Receives a list with the nested names in order and returns a Spark escaped column selector
    col_path_select = lambda paths: '.'.join([escape_name(path) for path in paths])

    def add_fields(fields, level=1, prefix=()):
        for field in fields:
            # If it's a Struct, and we are supposed to flatten it based on the maxLevels config
            if (type(field.dataType) == StructType) and ((level <= maxLevels) or (maxLevels == 0)):
                add_fields(field.dataType.fields, level + 1, prefix + (field.name,))
            else:
                # Add the column path to the list, each element of the path represents one level
                cols_path.append(list(prefix) + [field.name])

    add_fields(df.schema)
    # Enforce nested fields to use the full name via an alias, otherwise only the last name is kept
    col_list = [df[col_path_select(col_path)].alias(separator.join(col_path)) for col_path in cols_path]
    return df.select(col_list)
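
For example, assuming a DataFrame with a nested struct column (the column names id, address.city and address.zip below are just illustrative), the call would produce one flat column per leaf field:

# Hypothetical nested DataFrame used only to illustrate the call
data = [(1, ("Berlin", "10115"))]
df = spark.createDataFrame(data, "id INT, address STRUCT<city: STRING, zip: STRING>")

flat_df = flatten(df)
print(flat_df.columns)
# ['id', 'address.city', 'address.zip']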