Source code for etl_toolkit.expressions.regex

from functools import reduce
from datetime import date

from pyspark.sql.types import DoubleType
from pyspark.sql import functions as F, Column, DataFrame
from yipit_databricks_utils.helpers.telemetry import track_usage

from etl_toolkit.expressions.core import normalize_column
from etl_toolkit.expressions import boolean


def rlike_any(col: Column | str, patterns: list[str]) -> Column:
    """
    Function that validates if a column matches any of the provided regex patterns.
    This is equivalent to the following logic in Spark:
    ``['hello', 'test'] -> RLIKE 'hello|test'``. Using this expression is
    recommended over building a long regex string yourself, which can be hard
    to maintain and/or read.

    The return value is a boolean column that is True if any of the provided
    patterns are satisfied, and False otherwise.

    :param col: The input column to compare the ``patterns`` against.
        It should be a column of string type.
        If a string is provided, it is referenced as a Column.
    :param patterns: A list of regex patterns to match the input ``col`` against.
        This matching is done using the ``.rlike`` method.

    Examples
    -----------

    .. code-block:: python
        :caption: Using E.rlike_any to check a series of regex patterns.

        from etl_toolkit import E, F

        df = spark.createDataFrame([
            {"input": "Outback"},
            {"input": "outbacksthouse"},
        ])
        display(
            df.withColumn("output", E.rlike_any("input", ["outbackst", "outback s"]))
        )

        +--------------+------+
        |input         |output|
        +--------------+------+
        |Outback       |False |
        |outbacksthouse|True  |
        +--------------+------+
    """
    col = normalize_column(col)
    expression = "|".join(patterns)
    return col.rlike(expression)
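
# normalize_column is imported from etl_toolkit.expressions.core and its source
# is not shown on this page. Below is a minimal sketch of the behavior the
# docstring above implies ("If a string is provided, it is referenced as a
# Column"); the real helper may handle more cases.

def _normalize_column_sketch(col: Column | str) -> Column:
    # Hypothetical stand-in for the imported normalize_column: treat a plain
    # string as a column name and pass Column objects through unchanged.
    return F.col(col) if isinstance(col, str) else col
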
@track_usage
def rlike_all(col: Column | str, patterns: list[str]) -> Column:
    """
    Function that validates if a column matches all of the provided regex patterns.
    This is equivalent to the following logic in Spark:
    ``['hello', 'test'] -> RLIKE 'hello' AND RLIKE 'test'``. Using this
    expression is recommended over building a long regex string yourself, which
    can be hard to maintain and/or read.

    The return value is a boolean column that is True if all of the provided
    patterns are satisfied, and False otherwise.

    :param col: The input column to compare the ``patterns`` against.
        It should be a column of string type.
        If a string is provided, it is referenced as a Column.
    :param patterns: A list of regex patterns to match the input ``col`` against.
        This matching is done using the ``.rlike`` method.

    Examples
    -----------

    .. code-block:: python
        :caption: Using E.rlike_all to check a series of regex patterns.

        from etl_toolkit import E, F

        df = spark.createDataFrame([
            {"input": "Outback"},
            {"input": "outbacksthouse"},
        ])
        display(
            df.withColumn("output", E.rlike_all("input", ["outback", "house"]))
        )

        +--------------+------+
        |input         |output|
        +--------------+------+
        |Outback       |False |
        |outbacksthouse|True  |
        +--------------+------+
    """
    col = normalize_column(col)
    return boolean.all([col.rlike(pattern) for pattern in patterns])
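
# boolean.all is likewise imported rather than defined here. Below is a minimal
# sketch of the AND-reduction it appears to perform over the per-pattern
# predicates (note the reduce import at the top of this module); the actual
# implementation may differ.

def _boolean_all_sketch(predicates: list[Column]) -> Column:
    # Hypothetical stand-in for boolean.all: AND together a list of boolean
    # Columns so that every pattern must match for the result to be True.
    return reduce(lambda left, right: left & right, predicates)

# As the docstring examples above show, matching with .rlike is case-sensitive;
# a caller can opt into case-insensitive matching by prefixing a pattern with
# the (?i) inline flag supported by Java regular expressions.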