Source code for etl_toolkit.expressions.regex

from functools import reduce
from datetime import date

from pyspark.sql.types import DoubleType
from pyspark.sql import functions as F, Column, DataFrame
from yipit_databricks_utils.helpers.telemetry import track_usage

from etl_toolkit.expressions.core import normalize_column
from etl_toolkit.expressions import boolean


def rlike_any(col: Column | str, patterns: list[str]) -> Column:
    """
    Function that validates if a column matches any of the provided regex patterns.
    This is equivalent to the following logic in Spark:
    ``['hello', 'test'] -> RLIKE 'hello|test'``. Using this expression is
    recommended over building a long regex string yourself, which can be hard
    to maintain and/or read.

    The return value is a boolean column that is True if any of the provided
    patterns are satisfied, and False otherwise.

    :param col: The input column to compare the ``patterns`` against.
        It should be a column of string type.
        If a string is provided, it is referenced as a Column.
    :param patterns: A list of regex patterns to match the input ``col`` against.
        This matching is done using the ``.rlike`` method.

    Examples
    -----------

    .. code-block:: python
        :caption: Using E.rlike_any to check a series of regex patterns.

        from etl_toolkit import E, F

        df = spark.createDataFrame([
            {"input": "Outback"},
            {"input": "outbacksthouse"},
        ])
        display(
            df.withColumn("output", E.rlike_any("input", ["outbackst", "outback s"]))
        )

        +--------------+------+
        |input         |output|
        +--------------+------+
        |Outback       |False |
        |outbacksthouse|True  |
        +--------------+------+
    """
    col = normalize_column(col)
    expression = "|".join(patterns)
    return col.rlike(expression)
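
# normalize_column is imported from etl_toolkit.expressions.core and its source
# is not shown on this page. Below is a minimal sketch of the behavior the
# docstring above implies ("If a string is provided, it is referenced as a
# Column"); the real helper may handle more cases.

def _normalize_column_sketch(col: Column | str) -> Column:
    # Hypothetical stand-in for the imported normalize_column: treat a plain
    # string as a column name and pass Column objects through unchanged.
    return F.col(col) if isinstance(col, str) else col
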
@track_usage
def rlike_all(col: Column | str, patterns: list[str]) -> Column:
    """
    Function that validates if a column matches all of the provided regex patterns.
    This is equivalent to the following logic in Spark:
    ``['hello', 'test'] -> RLIKE 'hello' AND RLIKE 'test'``. Using this
    expression is recommended over building a long regex string yourself, which
    can be hard to maintain and/or read.

    The return value is a boolean column that is True if all of the provided
    patterns are satisfied, and False otherwise.

    :param col: The input column to compare the ``patterns`` against.
        It should be a column of string type.
        If a string is provided, it is referenced as a Column.
    :param patterns: A list of regex patterns to match the input ``col`` against.
        This matching is done using the ``.rlike`` method.

    Examples
    -----------

    .. code-block:: python
        :caption: Using E.rlike_all to check a series of regex patterns.

        from etl_toolkit import E, F

        df = spark.createDataFrame([
            {"input": "Outback"},
            {"input": "outbacksthouse"},
        ])
        display(
            df.withColumn("output", E.rlike_all("input", ["outback", "house"]))
        )

        +--------------+------+
        |input         |output|
        +--------------+------+
        |Outback       |False |
        |outbacksthouse|True  |
        +--------------+------+
    """
    col = normalize_column(col)
    return boolean.all([col.rlike(pattern) for pattern in patterns])
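
# boolean.all is likewise imported rather than defined here. Below is a minimal
# sketch of the AND-reduction it appears to perform over the per-pattern
# predicates (note the reduce import at the top of this module); the actual
# implementation may differ.

def _boolean_all_sketch(predicates: list[Column]) -> Column:
    # Hypothetical stand-in for boolean.all: AND together a list of boolean
    # Columns so that every pattern must match for the result to be True.
    return reduce(lambda left, right: left & right, predicates)

# As the docstring examples above show, matching with .rlike is case-sensitive;
# a caller can opt into case-insensitive matching by prefixing a pattern with
# the (?i) inline flag supported by Java regular expressions.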