# Source code for etl_toolkit.expressions.regex
from pyspark.sql import Column

from yipit_databricks_utils.helpers.telemetry import track_usage

from etl_toolkit.expressions.core import normalize_column
from etl_toolkit.expressions import boolean


def rlike_any(col: Column | str, patterns: list[str]) -> Column:
"""
Function that validates if a column matches any of the provided regex patterns.
This is equivalent to doing the following logic in spark: ['hello','test'] -> RLIKE 'hello|test'.
It is recommended using this expression over building a long regex string yourself as it can
be hard to maintain and/or read.
The return value is a boolean column that is True if any of the provided patterns are satisified, and False otherwise.
:param col: The input column to compare the ``patterns`` against. It should be a column of string type. If a string is provided, it is referenced as a Column.
:param patterns: A list of regex patterns to match the input ``col`` against. This matching is done using the ``.rlike`` method.
Examples
-----------
.. code-block:: python
:caption: Using E.rlike_any to check a series of regex patterns.
from etl_toolkit import E, F
df = spark.createDataFrame([
{"input": "Outback"},
{"input": "outbacksthouse"},
])
display(
df.withColumn("output", E.rlike_any("input", ["outbackst", "outback s"]))
)
+------------------------+----------------------+
|input |output |
+------------------------+----------------------+
|Outback |False |
+------------------------+----------------------+
|outbacksthouse |True |
+------------------------+----------------------+
"""
    col = normalize_column(col)
    # Join the patterns into a single alternation, e.g. ["hello", "test"] -> "hello|test"
    expression = "|".join(patterns)
    return col.rlike(expression)
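
# A minimal sketch of the equivalence described above (hypothetical data and
# column name; assumes a SparkSession bound to ``spark``):
#
#     from pyspark.sql import functions as F
#     df = spark.createDataFrame([{"name": "foobar"}])
#     df.select(
#         rlike_any("name", ["foo", "baz"]).alias("via_helper"),
#         F.col("name").rlike("foo|baz").alias("via_raw_regex"),
#     )
#
# Both columns evaluate to True for "foobar", since rlike_any joins the
# patterns into the single alternation "foo|baz" before calling .rlike.
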
@track_usage
def rlike_all(col: Column | str, patterns: list[str]) -> Column:
"""
Function that validates if a column matches all the provided regex patterns.
This is equivalent to doing the following logic in spark: ['hello','test'] -> RLIKE 'hello and RLIKE 'test'.
It is recommended using this expression over building a long regex string yourself as it can
be hard to maintain and/or read.
The return value is a boolean column that is True if all the provided patterns are satisified, and False otherwise.
:param col: The input column to compare the ``patterns`` against. It should be a column of string type. If a string is provided, it is referenced as a Column.
:param patterns: A list of regex patterns to match the input ``col`` against. This matching is done using the ``.rlike`` method.
Examples
-----------
.. code-block:: python
:caption: Using E.rlike_all to check a series of regex patterns.
from etl_toolkit import E, F
df = spark.createDataFrame([
{"input": "Outback"},
{"input": "outbacksthouse"},
])
display(
df.withColumn("output", E.rlike_all("input", ["outback", "house"]))
)
+------------------------+----------------------+
|input |output |
+------------------------+----------------------+
|Outback |False |
+------------------------+----------------------+
|outbacksthouse |True |
+------------------------+----------------------+
"""
    col = normalize_column(col)
    # Require every pattern to match by AND-ing the individual .rlike checks
    return boolean.all([col.rlike(pattern) for pattern in patterns])
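
# A minimal sketch of what rlike_all reduces to (hypothetical column name;
# the manually chained form is shown for illustration only):
#
#     from pyspark.sql import functions as F
#     helper = rlike_all("name", ["outback", "house"])
#     manual = F.col("name").rlike("outback") & F.col("name").rlike("house")
#
# Both expressions are True only when every pattern matches the column value,
# which is what ``boolean.all`` enforces over the list of .rlike checks.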