# Source code for etl_toolkit.analyses.ordering

from pyspark.sql import functions as F, DataFrame, Column
from yipit_databricks_utils.helpers.telemetry import track_usage

from etl_toolkit.expressions.core import normalize_column


@track_usage
def shift_columns(df: DataFrame, priority_columns: list[str | Column]) -> DataFrame:
    """
    Utility transformation that shifts any specified columns of the dataframe to
    the beginning of the schema.

    This can be useful for performance optimization (ex: improving join performance
    on key columns and clustering). The re-ordered columns can include new columns
    that don't already exist on the dataframe. These will be added in addition to
    any existing columns in the order of the ``priority_columns`` list.

    :param df: The dataframe to reorder columns on
    :param priority_columns: A list of Column(s) or string(s) to shift to the
        beginning of the schema. If strings are provided, they will be resolved
        as Columns.

    Examples
    -----------
    .. code-block:: python
        :caption: Re-order columns by specifying which columns should come at the
            beginning of the schema. All remaining columns will preserve the
            original order of the dataframe.

        from etl_toolkit import E, F, A
        from datetime import date

        df = spark.createDataFrame([
            {"color": "red", "value": 1000, "date": date(2023, 1, 1)},
            {"color": "blue", "value": 50, "date": date(2023, 1, 1)},
        ])
        display(
            A.shift_columns(
                df,
                priority_columns=["date"],
            )
        )

        +--------------+--------------+--------------+
        |date          |color         |value         |
        +--------------+--------------+--------------+
        |    2023-01-01|red           |          1000|
        +--------------+--------------+--------------+
        |    2023-01-01|blue          |            50|
        +--------------+--------------+--------------+

    .. code-block:: python
        :caption: New derived columns can be introduced in the ``priority_columns``
            list. These must be derived using other columns of the provided
            dataframe.

        from etl_toolkit import E, F, A
        from datetime import date

        df = spark.createDataFrame([
            {"color": "red", "value": 1000, "date": date(2023, 1, 1)},
            {"color": "blue", "value": 50, "date": date(2023, 1, 1)},
        ])
        display(
            A.shift_columns(
                df,
                priority_columns=["date", F.upper("color").alias("label")],
            )
        )

        +--------------+--------------+--------------+--------------+
        |date          |label         |color         |value         |
        +--------------+--------------+--------------+--------------+
        |    2023-01-01|RED           |red           |          1000|
        +--------------+--------------+--------------+--------------+
        |    2023-01-01|BLUE          |blue          |            50|
        +--------------+--------------+--------------+--------------+
    """
    # Resolve any string entries into Column expressions so the list is uniform.
    priority_columns = [normalize_column(col) for col in priority_columns]
    # Derive the *output* names of the priority columns via a cheap select —
    # this correctly accounts for aliases on derived columns, so they won't be
    # duplicated in the trailing selection below.
    priority_column_names = df.select(priority_columns).columns
    return df.select(
        priority_columns
        # Keep every remaining original column in its existing order.
        + [column for column in df.columns if column not in priority_column_names]
    )