Source code for pyspark_util.column

from pyspark.sql import functions as F
from pyspark.sql.types import IntegerType


def null_ratio(col_name):
    """
    Compute the fraction of rows in which the given column is ``NULL``.

    Parameters
    ----------
    col_name : str
        Name of the column to inspect.

    Returns
    -------
    column
        Aggregate column holding the null ratio, aliased to ``col_name``.

    Examples
    --------
    >>> df = spark.createDataFrame([
    ...     (1,),
    ...     (2,),
    ...     (None,),
    ...     (None,),
    ... ], ['x'])
    >>> df.select(psu.null_ratio('x')).show()  # doctest: +NORMALIZE_WHITESPACE
    +---+
    |  x|
    +---+
    |0.5|
    +---+
    """
    # Turn the NULL flag into 0/1 so that its mean is the share of NULL rows.
    null_flag = F.col(col_name).isNull()
    null_indicator = null_flag.cast(IntegerType())
    return F.mean(null_indicator).alias(col_name)
def blank_ratio(col_name, include_null=False):
    """
    Compute the fraction of rows in which the given column is an empty string.

    Parameters
    ----------
    col_name : str
        Name of the column to inspect.
    include_null : bool, default False
        If True, ``NULL`` rows are counted in the denominator (as non-blank
        rows). If False, ``NULL`` rows are left out of the calculation.

    Returns
    -------
    column
        Aggregate column holding the blank ratio, aliased to ``col_name``.

    Examples
    --------
    By default, ``NULL`` is ignored.

    >>> df = spark.createDataFrame([
    ...     ('a',),
    ...     ('b',),
    ...     ('',),
    ...     ('',),
    ...     (None,),
    ... ], ['x'])
    >>> df.select(psu.blank_ratio('x')).show()  # doctest: +NORMALIZE_WHITESPACE
    +---+
    |  x|
    +---+
    |0.5|
    +---+

    With ``include_null=True``, ``NULL`` is included in the calculation.

    >>> df = spark.createDataFrame([
    ...     ('a',),
    ...     ('b',),
    ...     ('',),
    ...     ('',),
    ...     (None,),
    ... ], ['x'])
    >>> df.select(psu.blank_ratio('x', include_null=True)).show()  # doctest: +NORMALIZE_WHITESPACE
    +---+
    |  x|
    +---+
    |0.4|
    +---+
    """
    blank_flag = F.col(col_name) == ''
    if include_null:
        # The comparison above yields NULL for NULL rows; map those rows to
        # False so they are counted as non-blank instead of being skipped.
        blank_flag = F.when(F.col(col_name).isNull(), False).otherwise(blank_flag)
    return F.mean(blank_flag.cast(IntegerType())).alias(col_name)
def is_unique(col_name):
    """
    Check whether every value of the given column occurs at most once.

    ``NULL`` is treated like any other value: a single ``NULL`` row is still
    unique, two or more ``NULL`` rows are not.

    Parameters
    ----------
    col_name : str
        Name of the column to inspect.

    Returns
    -------
    column
        Aggregate boolean column, aliased to ``col_name``.

    Examples
    --------
    >>> df = spark.createDataFrame([(1,), (2,), (3,)], ['x'])
    >>> df.select(psu.is_unique('x')).show()  # doctest: +NORMALIZE_WHITESPACE
    +----+
    |   x|
    +----+
    |true|
    +----+

    >>> df = spark.createDataFrame([(1,), (2,), (2,)], ['x'])
    >>> df.select(psu.is_unique('x')).show()  # doctest: +NORMALIZE_WHITESPACE
    +-----+
    |    x|
    +-----+
    |false|
    +-----+

    >>> df = spark.createDataFrame([(1,), (2,), (3,), (None,)], ['x'])
    >>> df.select(psu.is_unique('x')).show()  # doctest: +NORMALIZE_WHITESPACE
    +----+
    |   x|
    +----+
    |true|
    +----+

    >>> df = spark.createDataFrame([(1,), (2,), (3,), (None,), (None,)], ['x'])
    >>> df.select(psu.is_unique('x')).show()  # doctest: +NORMALIZE_WHITESPACE
    +-----+
    |    x|
    +-----+
    |false|
    +-----+
    """
    # Non-NULL values are unique when their count equals their distinct count.
    values_are_distinct = F.count(col_name) == F.countDistinct(col_name)

    # The check above does not cover NULL rows (see the last example), so
    # count them separately: emit 1 for each NULL row and NULL otherwise.
    null_marker = F.when(F.col(col_name).isNull(), 1).otherwise(None)
    at_most_one_null = F.count(null_marker) <= 1

    return (values_are_distinct & at_most_one_null).alias(col_name)
def contains(col_name, pat):
    """
    Test if pattern or regex is contained within a string.

    Parameters
    ----------
    col_name : str
        column name
    pat : str
        character sequence or regular expression.

    Returns
    -------
    column
        column of boolean values indicating whether the given pattern is
        contained within each element. ``NULL`` elements yield ``NULL``.
        The result is aliased to ``col_name``.

    Notes
    -----
    The match is evaluated with ``Column.rlike`` rather than by comparing the
    extracted match against ``''``. An empty extraction cannot tell "no match"
    apart from "matched an empty string", so patterns that can match zero
    characters (e.g. ``'a*'`` or ``'^$'``) would wrongly be reported as not
    contained.

    Examples
    --------
    >>> df = spark.createDataFrame([('abc',), ('123',), (None,)], ['x'])
    >>> df.select(psu.contains('x', 'abc')).show()  # doctest: +NORMALIZE_WHITESPACE
    +-----+
    |    x|
    +-----+
    | true|
    |false|
    | null|
    +-----+

    >>> df = spark.createDataFrame([('abc',), ('123',), (None,)], ['x'])
    >>> df.select(psu.contains('x', r'[a-z]+')).show()  # doctest: +NORMALIZE_WHITESPACE
    +-----+
    |    x|
    +-----+
    | true|
    |false|
    | null|
    +-----+
    """
    # rlike is true when the regex matches anywhere in the string, including
    # zero-length matches, and propagates NULL for NULL input.
    return F.col(col_name).rlike(pat).alias(col_name)