Skip to content

Polars

clean_names

clean_names implementation for polars.

clean_names(df, strip_underscores=None, case_type='lower', remove_special=False, strip_accents=False, truncate_limit=None)

Clean the column names in a polars DataFrame.

clean_names can also be applied to a LazyFrame.

Examples:

>>> import polars as pl
>>> import janitor.polars
>>> df = pl.DataFrame(
...     {
...         "Aloha": range(3),
...         "Bell Chart": range(3),
...         "Animals@#$%^": range(3)
...     }
... )
>>> df
shape: (3, 3)
┌───────┬────────────┬──────────────┐
│ Aloha ┆ Bell Chart ┆ Animals@#$%^ │
│ ---   ┆ ---        ┆ ---          │
│ i64   ┆ i64        ┆ i64          │
╞═══════╪════════════╪══════════════╡
│ 0     ┆ 0          ┆ 0            │
│ 1     ┆ 1          ┆ 1            │
│ 2     ┆ 2          ┆ 2            │
└───────┴────────────┴──────────────┘
>>> df.clean_names(remove_special=True)
shape: (3, 3)
┌───────┬────────────┬─────────┐
│ aloha ┆ bell_chart ┆ animals │
│ ---   ┆ ---        ┆ ---     │
│ i64   ┆ i64        ┆ i64     │
╞═══════╪════════════╪═════════╡
│ 0     ┆ 0          ┆ 0       │
│ 1     ┆ 1          ┆ 1       │
│ 2     ┆ 2          ┆ 2       │
└───────┴────────────┴─────────┘

New in version 0.28.0

Parameters:

Name Type Description Default
strip_underscores str | bool

Removes the outer underscores from all column names. Default None keeps outer underscores. Values can be either 'left', 'right' or 'both' or the respective shorthand 'l', 'r' and True.

None
case_type str

Whether to make the column names lower or uppercase. Current case may be preserved with 'preserve', while snake case conversion (from CamelCase or camelCase only) can be turned on using "snake". Default 'lower' makes all characters lowercase.

'lower'
remove_special bool

Remove special characters from the column names. Only letters, numbers and underscores are preserved.

False
strip_accents bool

Whether or not to remove accents from the labels.

False
truncate_limit int

Truncates formatted column names to the specified length. Default None does not truncate.

None

Returns:

Type Description
DataFrame | LazyFrame

A polars DataFrame/LazyFrame.

Source code in janitor/polars/clean_names.py
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
@register_lazyframe_method
@register_dataframe_method
def clean_names(
    df: pl.DataFrame | pl.LazyFrame,
    strip_underscores: str | bool = None,
    case_type: str = "lower",
    remove_special: bool = False,
    strip_accents: bool = False,
    truncate_limit: int = None,
) -> pl.DataFrame | pl.LazyFrame:
    """
    Clean the column names in a polars DataFrame.

    `clean_names` can also be applied to a LazyFrame.

    Examples:
        >>> import polars as pl
        >>> import janitor.polars
        >>> df = pl.DataFrame(
        ...     {
        ...         "Aloha": range(3),
        ...         "Bell Chart": range(3),
        ...         "Animals@#$%^": range(3)
        ...     }
        ... )
        >>> df
        shape: (3, 3)
        ┌───────┬────────────┬──────────────┐
        │ Aloha ┆ Bell Chart ┆ Animals@#$%^ │
        │ ---   ┆ ---        ┆ ---          │
        │ i64   ┆ i64        ┆ i64          │
        ╞═══════╪════════════╪══════════════╡
        │ 0     ┆ 0          ┆ 0            │
        │ 1     ┆ 1          ┆ 1            │
        │ 2     ┆ 2          ┆ 2            │
        └───────┴────────────┴──────────────┘
        >>> df.clean_names(remove_special=True)
        shape: (3, 3)
        ┌───────┬────────────┬─────────┐
        │ aloha ┆ bell_chart ┆ animals │
        │ ---   ┆ ---        ┆ ---     │
        │ i64   ┆ i64        ┆ i64     │
        ╞═══════╪════════════╪═════════╡
        │ 0     ┆ 0          ┆ 0       │
        │ 1     ┆ 1          ┆ 1       │
        │ 2     ┆ 2          ┆ 2       │
        └───────┴────────────┴─────────┘

    !!! info "New in version 0.28.0"

    Args:
        strip_underscores: Removes the outer underscores from all
            column names. Default None keeps outer underscores. Values can be
            either 'left', 'right' or 'both' or the respective shorthand 'l',
            'r' and True.
        case_type: Whether to make the column names lower or uppercase.
            Current case may be preserved with 'preserve',
            while snake case conversion (from CamelCase or camelCase only)
            can be turned on using "snake".
            Default 'lower' makes all characters lowercase.
        remove_special: Remove special characters from the column names.
            Only letters, numbers and underscores are preserved.
        strip_accents: Whether or not to remove accents from
            the labels.
        truncate_limit: Truncates formatted column names to
            the specified length. Default None does not truncate.

    Returns:
        A polars DataFrame/LazyFrame.
    """  # noqa: E501
    return df.rename(
        lambda col: _clean_column_names(
            obj=col,
            strip_accents=strip_accents,
            strip_underscores=strip_underscores,
            case_type=case_type,
            remove_special=remove_special,
            truncate_limit=truncate_limit,
        )
    )

make_clean_names(expression, strip_underscores=None, case_type='lower', remove_special=False, strip_accents=False, enforce_string=False, truncate_limit=None)

Clean the labels in a polars Expression.

Examples:

>>> import polars as pl
>>> import janitor.polars
>>> df = pl.DataFrame({"raw": ["Abçdê fgí j"]})
>>> df
shape: (1, 1)
┌─────────────┐
│ raw         │
│ ---         │
│ str         │
╞═════════════╡
│ Abçdê fgí j │
└─────────────┘

Clean the column values:

>>> df.with_columns(pl.col("raw").make_clean_names(strip_accents=True))
shape: (1, 1)
┌─────────────┐
│ raw         │
│ ---         │
│ str         │
╞═════════════╡
│ abcde_fgi_j │
└─────────────┘

New in version 0.28.0

Parameters:

Name Type Description Default
strip_underscores str | bool

Removes the outer underscores from all labels in the expression. Default None keeps outer underscores. Values can be either 'left', 'right' or 'both' or the respective shorthand 'l', 'r' and True.

None
case_type str

Whether to make the labels in the expression lower or uppercase. Current case may be preserved with 'preserve', while snake case conversion (from CamelCase or camelCase only) can be turned on using "snake". Default 'lower' makes all characters lowercase.

'lower'
remove_special bool

Remove special characters from the values in the expression. Only letters, numbers and underscores are preserved.

False
strip_accents bool

Whether or not to remove accents from the expression.

False
enforce_string bool

Whether or not to cast the expression to a string type.

False
truncate_limit int

Truncates formatted labels in the expression to the specified length. Default None does not truncate.

None

Returns:

Type Description
Expr

A polars Expression.

Source code in janitor/polars/clean_names.py
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
@register_expr_method
def make_clean_names(
    expression,
    strip_underscores: str | bool = None,
    case_type: str = "lower",
    remove_special: bool = False,
    strip_accents: bool = False,
    enforce_string: bool = False,
    truncate_limit: int = None,
) -> pl.Expr:
    """
    Clean the labels in a polars Expression.

    Examples:
        >>> import polars as pl
        >>> import janitor.polars
        >>> df = pl.DataFrame({"raw": ["Abçdê fgí j"]})
        >>> df
        shape: (1, 1)
        ┌─────────────┐
        │ raw         │
        │ ---         │
        │ str         │
        ╞═════════════╡
        │ Abçdê fgí j │
        └─────────────┘

        Clean the column values:
        >>> df.with_columns(pl.col("raw").make_clean_names(strip_accents=True))
        shape: (1, 1)
        ┌─────────────┐
        │ raw         │
        │ ---         │
        │ str         │
        ╞═════════════╡
        │ abcde_fgi_j │
        └─────────────┘

    !!! info "New in version 0.28.0"

    Args:
        strip_underscores: Removes the outer underscores
            from all labels in the expression.
            Default None keeps outer underscores.
            Values can be either 'left', 'right'
            or 'both' or the respective shorthand 'l',
            'r' and True.
        case_type: Whether to make the labels in the expression lower or uppercase.
            Current case may be preserved with 'preserve',
            while snake case conversion (from CamelCase or camelCase only)
            can be turned on using "snake".
            Default 'lower' makes all characters lowercase.
        remove_special: Remove special characters from the values in the expression.
            Only letters, numbers and underscores are preserved.
        strip_accents: Whether or not to remove accents from
            the expression.
        enforce_string: Whether or not to cast the expression to a string type.
        truncate_limit: Truncates formatted labels in the expression to
            the specified length. Default None does not truncate.

    Returns:
        A polars Expression.
    """
    return _clean_expr_names(
        obj=expression,
        strip_accents=strip_accents,
        strip_underscores=strip_underscores,
        case_type=case_type,
        remove_special=remove_special,
        enforce_string=enforce_string,
        truncate_limit=truncate_limit,
    )

complete

complete implementation for polars.

complete(df, *columns, fill_value=None, explicit=True, sort=False, by=None)

Turns implicit missing values into explicit missing values

It is modeled after tidyr's complete function. In a way, it is the inverse of pl.drop_nulls, as it exposes implicitly missing rows.

If the combination involves multiple columns, pass it as a struct, with an alias - the name of the struct should not exist in the DataFrame.

If new values need to be introduced, a polars Expression with the new values can be passed, as long as the polars Expression has a name that already exists in the DataFrame.

It is up to the user to ensure that the polars expression returns unique values and/or sorted values.

Note that if the polars expression evaluates to a struct, then the fields, not the name, should already exist in the DataFrame.

complete can also be applied to a LazyFrame.

Examples:

>>> import polars as pl
>>> import janitor.polars
>>> df = pl.DataFrame(
...     dict(
...         group=(1, 2, 1, 2),
...         item_id=(1, 2, 2, 3),
...         item_name=("a", "a", "b", "b"),
...         value1=(1, None, 3, 4),
...         value2=range(4, 8),
...     )
... )
>>> df
shape: (4, 5)
┌───────┬─────────┬───────────┬────────┬────────┐
│ group ┆ item_id ┆ item_name ┆ value1 ┆ value2 │
│ ---   ┆ ---     ┆ ---       ┆ ---    ┆ ---    │
│ i64   ┆ i64     ┆ str       ┆ i64    ┆ i64    │
╞═══════╪═════════╪═══════════╪════════╪════════╡
│ 1     ┆ 1       ┆ a         ┆ 1      ┆ 4      │
│ 2     ┆ 2       ┆ a         ┆ null   ┆ 5      │
│ 1     ┆ 2       ┆ b         ┆ 3      ┆ 6      │
│ 2     ┆ 3       ┆ b         ┆ 4      ┆ 7      │
└───────┴─────────┴───────────┴────────┴────────┘

Generate all possible combinations of group, item_id, and item_name (whether or not they appear in the data)

>>> with pl.Config(tbl_rows=-1):
...     df.complete("group", "item_id", "item_name", sort=True)
shape: (12, 5)
┌───────┬─────────┬───────────┬────────┬────────┐
│ group ┆ item_id ┆ item_name ┆ value1 ┆ value2 │
│ ---   ┆ ---     ┆ ---       ┆ ---    ┆ ---    │
│ i64   ┆ i64     ┆ str       ┆ i64    ┆ i64    │
╞═══════╪═════════╪═══════════╪════════╪════════╡
│ 1     ┆ 1       ┆ a         ┆ 1      ┆ 4      │
│ 1     ┆ 1       ┆ b         ┆ null   ┆ null   │
│ 1     ┆ 2       ┆ a         ┆ null   ┆ null   │
│ 1     ┆ 2       ┆ b         ┆ 3      ┆ 6      │
│ 1     ┆ 3       ┆ a         ┆ null   ┆ null   │
│ 1     ┆ 3       ┆ b         ┆ null   ┆ null   │
│ 2     ┆ 1       ┆ a         ┆ null   ┆ null   │
│ 2     ┆ 1       ┆ b         ┆ null   ┆ null   │
│ 2     ┆ 2       ┆ a         ┆ null   ┆ 5      │
│ 2     ┆ 2       ┆ b         ┆ null   ┆ null   │
│ 2     ┆ 3       ┆ a         ┆ null   ┆ null   │
│ 2     ┆ 3       ┆ b         ┆ 4      ┆ 7      │
└───────┴─────────┴───────────┴────────┴────────┘

Cross all possible group values with the unique pairs of (item_id, item_name) that already exist in the data. For such situations, where there is a group of columns, pass it in as a struct:

>>> with pl.Config(tbl_rows=-1):
...     df.complete(
...         "group",
...         pl.struct("item_id", "item_name").unique().sort().alias("rar"),
...         sort=True
...     )
shape: (8, 5)
┌───────┬─────────┬───────────┬────────┬────────┐
│ group ┆ item_id ┆ item_name ┆ value1 ┆ value2 │
│ ---   ┆ ---     ┆ ---       ┆ ---    ┆ ---    │
│ i64   ┆ i64     ┆ str       ┆ i64    ┆ i64    │
╞═══════╪═════════╪═══════════╪════════╪════════╡
│ 1     ┆ 1       ┆ a         ┆ 1      ┆ 4      │
│ 1     ┆ 2       ┆ a         ┆ null   ┆ null   │
│ 1     ┆ 2       ┆ b         ┆ 3      ┆ 6      │
│ 1     ┆ 3       ┆ b         ┆ null   ┆ null   │
│ 2     ┆ 1       ┆ a         ┆ null   ┆ null   │
│ 2     ┆ 2       ┆ a         ┆ null   ┆ 5      │
│ 2     ┆ 2       ┆ b         ┆ null   ┆ null   │
│ 2     ┆ 3       ┆ b         ┆ 4      ┆ 7      │
└───────┴─────────┴───────────┴────────┴────────┘

Fill in nulls:

>>> with pl.Config(tbl_rows=-1):
...     df.complete(
...         "group",
...         pl.struct("item_id", "item_name").unique().sort().alias('rar'),
...         fill_value={"value1": 0, "value2": 99},
...         explicit=True,
...         sort=True,
...     )
shape: (8, 5)
┌───────┬─────────┬───────────┬────────┬────────┐
│ group ┆ item_id ┆ item_name ┆ value1 ┆ value2 │
│ ---   ┆ ---     ┆ ---       ┆ ---    ┆ ---    │
│ i64   ┆ i64     ┆ str       ┆ i64    ┆ i64    │
╞═══════╪═════════╪═══════════╪════════╪════════╡
│ 1     ┆ 1       ┆ a         ┆ 1      ┆ 4      │
│ 1     ┆ 2       ┆ a         ┆ 0      ┆ 99     │
│ 1     ┆ 2       ┆ b         ┆ 3      ┆ 6      │
│ 1     ┆ 3       ┆ b         ┆ 0      ┆ 99     │
│ 2     ┆ 1       ┆ a         ┆ 0      ┆ 99     │
│ 2     ┆ 2       ┆ a         ┆ 0      ┆ 5      │
│ 2     ┆ 2       ┆ b         ┆ 0      ┆ 99     │
│ 2     ┆ 3       ┆ b         ┆ 4      ┆ 7      │
└───────┴─────────┴───────────┴────────┴────────┘

Limit the fill to only the newly created missing values with explicit = FALSE

>>> with pl.Config(tbl_rows=-1):
...     df.complete(
...         "group",
...         pl.struct("item_id", "item_name").unique().sort().alias('rar'),
...         fill_value={"value1": 0, "value2": 99},
...         explicit=False,
...         sort=True,
...     )
shape: (8, 5)
┌───────┬─────────┬───────────┬────────┬────────┐
│ group ┆ item_id ┆ item_name ┆ value1 ┆ value2 │
│ ---   ┆ ---     ┆ ---       ┆ ---    ┆ ---    │
│ i64   ┆ i64     ┆ str       ┆ i64    ┆ i64    │
╞═══════╪═════════╪═══════════╪════════╪════════╡
│ 1     ┆ 1       ┆ a         ┆ 1      ┆ 4      │
│ 1     ┆ 2       ┆ a         ┆ 0      ┆ 99     │
│ 1     ┆ 2       ┆ b         ┆ 3      ┆ 6      │
│ 1     ┆ 3       ┆ b         ┆ 0      ┆ 99     │
│ 2     ┆ 1       ┆ a         ┆ 0      ┆ 99     │
│ 2     ┆ 2       ┆ a         ┆ null   ┆ 5      │
│ 2     ┆ 2       ┆ b         ┆ 0      ┆ 99     │
│ 2     ┆ 3       ┆ b         ┆ 4      ┆ 7      │
└───────┴─────────┴───────────┴────────┴────────┘
>>> df = pl.DataFrame(
...     {
...         "Year": [1999, 2000, 2004, 1999, 2004],
...         "Taxon": [
...             "Saccharina",
...             "Saccharina",
...             "Saccharina",
...             "Agarum",
...             "Agarum",
...         ],
...         "Abundance": [4, 5, 2, 1, 8],
...     }
... )
>>> df
shape: (5, 3)
┌──────┬────────────┬───────────┐
│ Year ┆ Taxon      ┆ Abundance │
│ ---  ┆ ---        ┆ ---       │
│ i64  ┆ str        ┆ i64       │
╞══════╪════════════╪═══════════╡
│ 1999 ┆ Saccharina ┆ 4         │
│ 2000 ┆ Saccharina ┆ 5         │
│ 2004 ┆ Saccharina ┆ 2         │
│ 1999 ┆ Agarum     ┆ 1         │
│ 2004 ┆ Agarum     ┆ 8         │
└──────┴────────────┴───────────┘

Expose missing years from 1999 to 2004 - pass a polars expression with the new dates, and ensure the expression's name already exists in the DataFrame:

>>> expression = pl.int_range(1999,2005).alias('Year')
>>> with pl.Config(tbl_rows=-1):
...     df.complete(expression,'Taxon',sort=True)
shape: (12, 3)
┌──────┬────────────┬───────────┐
│ Year ┆ Taxon      ┆ Abundance │
│ ---  ┆ ---        ┆ ---       │
│ i64  ┆ str        ┆ i64       │
╞══════╪════════════╪═══════════╡
│ 1999 ┆ Agarum     ┆ 1         │
│ 1999 ┆ Saccharina ┆ 4         │
│ 2000 ┆ Agarum     ┆ null      │
│ 2000 ┆ Saccharina ┆ 5         │
│ 2001 ┆ Agarum     ┆ null      │
│ 2001 ┆ Saccharina ┆ null      │
│ 2002 ┆ Agarum     ┆ null      │
│ 2002 ┆ Saccharina ┆ null      │
│ 2003 ┆ Agarum     ┆ null      │
│ 2003 ┆ Saccharina ┆ null      │
│ 2004 ┆ Agarum     ┆ 8         │
│ 2004 ┆ Saccharina ┆ 2         │
└──────┴────────────┴───────────┘

Expose missing rows per group:

>>> df = pl.DataFrame(
...     {
...         "state": ["CA", "CA", "HI", "HI", "HI", "NY", "NY"],
...         "year": [2010, 2013, 2010, 2012, 2016, 2009, 2013],
...         "value": [1, 3, 1, 2, 3, 2, 5],
...     }
... )
>>> df
shape: (7, 3)
┌───────┬──────┬───────┐
│ state ┆ year ┆ value │
│ ---   ┆ ---  ┆ ---   │
│ str   ┆ i64  ┆ i64   │
╞═══════╪══════╪═══════╡
│ CA    ┆ 2010 ┆ 1     │
│ CA    ┆ 2013 ┆ 3     │
│ HI    ┆ 2010 ┆ 1     │
│ HI    ┆ 2012 ┆ 2     │
│ HI    ┆ 2016 ┆ 3     │
│ NY    ┆ 2009 ┆ 2     │
│ NY    ┆ 2013 ┆ 5     │
└───────┴──────┴───────┘
>>> low = pl.col('year').min()
>>> high = pl.col('year').max().add(1)
>>> new_year_values=pl.int_range(low,high).alias('year')
>>> with pl.Config(tbl_rows=-1):
...     df.complete(new_year_values,by='state',sort=True)
shape: (16, 3)
┌───────┬──────┬───────┐
│ state ┆ year ┆ value │
│ ---   ┆ ---  ┆ ---   │
│ str   ┆ i64  ┆ i64   │
╞═══════╪══════╪═══════╡
│ CA    ┆ 2010 ┆ 1     │
│ CA    ┆ 2011 ┆ null  │
│ CA    ┆ 2012 ┆ null  │
│ CA    ┆ 2013 ┆ 3     │
│ HI    ┆ 2010 ┆ 1     │
│ HI    ┆ 2011 ┆ null  │
│ HI    ┆ 2012 ┆ 2     │
│ HI    ┆ 2013 ┆ null  │
│ HI    ┆ 2014 ┆ null  │
│ HI    ┆ 2015 ┆ null  │
│ HI    ┆ 2016 ┆ 3     │
│ NY    ┆ 2009 ┆ 2     │
│ NY    ┆ 2010 ┆ null  │
│ NY    ┆ 2011 ┆ null  │
│ NY    ┆ 2012 ┆ null  │
│ NY    ┆ 2013 ┆ 5     │
└───────┴──────┴───────┘

New in version 0.28.0

Parameters:

Name Type Description Default
*columns ColumnNameOrSelector

This refers to the columns to be completed. It can be a string or a column selector or a polars expression. A polars expression can be used to introduced new values, as long as the polars expression has a name that already exists in the DataFrame. It is up to the user to ensure that the polars expression returns unique values.

()
fill_value dict | Any | Expr

Scalar value or polars expression to use instead of nulls for missing combinations. A dictionary, mapping columns names to a scalar value is also accepted.

None
explicit bool

Determines if only implicitly missing values should be filled (False), or all nulls existing in the LazyFrame (True). explicit is applicable only if fill_value is not None.

True
sort bool

Sort the DataFrame based on *columns.

False
by ColumnNameOrSelector

Column(s) to group by. The explicit missing rows are returned per group.

None

Returns:

Type Description
DataFrame | LazyFrame

A polars DataFrame/LazyFrame.

Source code in janitor/polars/complete.py
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
@register_lazyframe_method
@register_dataframe_method
def complete(
    df: pl.DataFrame | pl.LazyFrame,
    *columns: ColumnNameOrSelector,
    fill_value: dict | Any | pl.Expr = None,
    explicit: bool = True,
    sort: bool = False,
    by: ColumnNameOrSelector = None,
) -> pl.DataFrame | pl.LazyFrame:
    """
    Turns implicit missing values into explicit missing values

    It is modeled after tidyr's `complete` function.
    In a way, it is the inverse of `pl.drop_nulls`,
    as it exposes implicitly missing rows.

    If the combination involves multiple columns, pass it as a struct,
    with an alias - the name of the struct should not exist in the DataFrame.

    If new values need to be introduced, a polars Expression
    with the new values can be passed, as long as the polars Expression
    has a name that already exists in the DataFrame.

    It is up to the user to ensure that the polars expression returns
    unique values and/or sorted values.

    Note that if the polars expression evaluates to a struct,
    then the fields, not the name, should already exist in the DataFrame.

    `complete` can also be applied to a LazyFrame.

    Examples:
        >>> import polars as pl
        >>> import janitor.polars
        >>> df = pl.DataFrame(
        ...     dict(
        ...         group=(1, 2, 1, 2),
        ...         item_id=(1, 2, 2, 3),
        ...         item_name=("a", "a", "b", "b"),
        ...         value1=(1, None, 3, 4),
        ...         value2=range(4, 8),
        ...     )
        ... )
        >>> df
        shape: (4, 5)
        ┌───────┬─────────┬───────────┬────────┬────────┐
        │ group ┆ item_id ┆ item_name ┆ value1 ┆ value2 │
        │ ---   ┆ ---     ┆ ---       ┆ ---    ┆ ---    │
        │ i64   ┆ i64     ┆ str       ┆ i64    ┆ i64    │
        ╞═══════╪═════════╪═══════════╪════════╪════════╡
        │ 1     ┆ 1       ┆ a         ┆ 1      ┆ 4      │
        │ 2     ┆ 2       ┆ a         ┆ null   ┆ 5      │
        │ 1     ┆ 2       ┆ b         ┆ 3      ┆ 6      │
        │ 2     ┆ 3       ┆ b         ┆ 4      ┆ 7      │
        └───────┴─────────┴───────────┴────────┴────────┘

        Generate all possible combinations of
        `group`, `item_id`, and `item_name`
        (whether or not they appear in the data)
        >>> with pl.Config(tbl_rows=-1):
        ...     df.complete("group", "item_id", "item_name", sort=True)
        shape: (12, 5)
        ┌───────┬─────────┬───────────┬────────┬────────┐
        │ group ┆ item_id ┆ item_name ┆ value1 ┆ value2 │
        │ ---   ┆ ---     ┆ ---       ┆ ---    ┆ ---    │
        │ i64   ┆ i64     ┆ str       ┆ i64    ┆ i64    │
        ╞═══════╪═════════╪═══════════╪════════╪════════╡
        │ 1     ┆ 1       ┆ a         ┆ 1      ┆ 4      │
        │ 1     ┆ 1       ┆ b         ┆ null   ┆ null   │
        │ 1     ┆ 2       ┆ a         ┆ null   ┆ null   │
        │ 1     ┆ 2       ┆ b         ┆ 3      ┆ 6      │
        │ 1     ┆ 3       ┆ a         ┆ null   ┆ null   │
        │ 1     ┆ 3       ┆ b         ┆ null   ┆ null   │
        │ 2     ┆ 1       ┆ a         ┆ null   ┆ null   │
        │ 2     ┆ 1       ┆ b         ┆ null   ┆ null   │
        │ 2     ┆ 2       ┆ a         ┆ null   ┆ 5      │
        │ 2     ┆ 2       ┆ b         ┆ null   ┆ null   │
        │ 2     ┆ 3       ┆ a         ┆ null   ┆ null   │
        │ 2     ┆ 3       ┆ b         ┆ 4      ┆ 7      │
        └───────┴─────────┴───────────┴────────┴────────┘

        Cross all possible `group` values with the unique pairs of
        `(item_id, item_name)` that already exist in the data.
        For such situations, where there is a group of columns,
        pass it in as a struct:
        >>> with pl.Config(tbl_rows=-1):
        ...     df.complete(
        ...         "group",
        ...         pl.struct("item_id", "item_name").unique().sort().alias("rar"),
        ...         sort=True
        ...     )
        shape: (8, 5)
        ┌───────┬─────────┬───────────┬────────┬────────┐
        │ group ┆ item_id ┆ item_name ┆ value1 ┆ value2 │
        │ ---   ┆ ---     ┆ ---       ┆ ---    ┆ ---    │
        │ i64   ┆ i64     ┆ str       ┆ i64    ┆ i64    │
        ╞═══════╪═════════╪═══════════╪════════╪════════╡
        │ 1     ┆ 1       ┆ a         ┆ 1      ┆ 4      │
        │ 1     ┆ 2       ┆ a         ┆ null   ┆ null   │
        │ 1     ┆ 2       ┆ b         ┆ 3      ┆ 6      │
        │ 1     ┆ 3       ┆ b         ┆ null   ┆ null   │
        │ 2     ┆ 1       ┆ a         ┆ null   ┆ null   │
        │ 2     ┆ 2       ┆ a         ┆ null   ┆ 5      │
        │ 2     ┆ 2       ┆ b         ┆ null   ┆ null   │
        │ 2     ┆ 3       ┆ b         ┆ 4      ┆ 7      │
        └───────┴─────────┴───────────┴────────┴────────┘

        Fill in nulls:
        >>> with pl.Config(tbl_rows=-1):
        ...     df.complete(
        ...         "group",
        ...         pl.struct("item_id", "item_name").unique().sort().alias('rar'),
        ...         fill_value={"value1": 0, "value2": 99},
        ...         explicit=True,
        ...         sort=True,
        ...     )
        shape: (8, 5)
        ┌───────┬─────────┬───────────┬────────┬────────┐
        │ group ┆ item_id ┆ item_name ┆ value1 ┆ value2 │
        │ ---   ┆ ---     ┆ ---       ┆ ---    ┆ ---    │
        │ i64   ┆ i64     ┆ str       ┆ i64    ┆ i64    │
        ╞═══════╪═════════╪═══════════╪════════╪════════╡
        │ 1     ┆ 1       ┆ a         ┆ 1      ┆ 4      │
        │ 1     ┆ 2       ┆ a         ┆ 0      ┆ 99     │
        │ 1     ┆ 2       ┆ b         ┆ 3      ┆ 6      │
        │ 1     ┆ 3       ┆ b         ┆ 0      ┆ 99     │
        │ 2     ┆ 1       ┆ a         ┆ 0      ┆ 99     │
        │ 2     ┆ 2       ┆ a         ┆ 0      ┆ 5      │
        │ 2     ┆ 2       ┆ b         ┆ 0      ┆ 99     │
        │ 2     ┆ 3       ┆ b         ┆ 4      ┆ 7      │
        └───────┴─────────┴───────────┴────────┴────────┘

        Limit the fill to only the newly created
        missing values with `explicit = FALSE`
        >>> with pl.Config(tbl_rows=-1):
        ...     df.complete(
        ...         "group",
        ...         pl.struct("item_id", "item_name").unique().sort().alias('rar'),
        ...         fill_value={"value1": 0, "value2": 99},
        ...         explicit=False,
        ...         sort=True,
        ...     )
        shape: (8, 5)
        ┌───────┬─────────┬───────────┬────────┬────────┐
        │ group ┆ item_id ┆ item_name ┆ value1 ┆ value2 │
        │ ---   ┆ ---     ┆ ---       ┆ ---    ┆ ---    │
        │ i64   ┆ i64     ┆ str       ┆ i64    ┆ i64    │
        ╞═══════╪═════════╪═══════════╪════════╪════════╡
        │ 1     ┆ 1       ┆ a         ┆ 1      ┆ 4      │
        │ 1     ┆ 2       ┆ a         ┆ 0      ┆ 99     │
        │ 1     ┆ 2       ┆ b         ┆ 3      ┆ 6      │
        │ 1     ┆ 3       ┆ b         ┆ 0      ┆ 99     │
        │ 2     ┆ 1       ┆ a         ┆ 0      ┆ 99     │
        │ 2     ┆ 2       ┆ a         ┆ null   ┆ 5      │
        │ 2     ┆ 2       ┆ b         ┆ 0      ┆ 99     │
        │ 2     ┆ 3       ┆ b         ┆ 4      ┆ 7      │
        └───────┴─────────┴───────────┴────────┴────────┘

        >>> df = pl.DataFrame(
        ...     {
        ...         "Year": [1999, 2000, 2004, 1999, 2004],
        ...         "Taxon": [
        ...             "Saccharina",
        ...             "Saccharina",
        ...             "Saccharina",
        ...             "Agarum",
        ...             "Agarum",
        ...         ],
        ...         "Abundance": [4, 5, 2, 1, 8],
        ...     }
        ... )
        >>> df
        shape: (5, 3)
        ┌──────┬────────────┬───────────┐
        │ Year ┆ Taxon      ┆ Abundance │
        │ ---  ┆ ---        ┆ ---       │
        │ i64  ┆ str        ┆ i64       │
        ╞══════╪════════════╪═══════════╡
        │ 1999 ┆ Saccharina ┆ 4         │
        │ 2000 ┆ Saccharina ┆ 5         │
        │ 2004 ┆ Saccharina ┆ 2         │
        │ 1999 ┆ Agarum     ┆ 1         │
        │ 2004 ┆ Agarum     ┆ 8         │
        └──────┴────────────┴───────────┘

        Expose missing years from 1999 to 2004 -
        pass a polars expression with the new dates,
        and ensure the expression's name already exists
        in the DataFrame:
        >>> expression = pl.int_range(1999,2005).alias('Year')
        >>> with pl.Config(tbl_rows=-1):
        ...     df.complete(expression,'Taxon',sort=True)
        shape: (12, 3)
        ┌──────┬────────────┬───────────┐
        │ Year ┆ Taxon      ┆ Abundance │
        │ ---  ┆ ---        ┆ ---       │
        │ i64  ┆ str        ┆ i64       │
        ╞══════╪════════════╪═══════════╡
        │ 1999 ┆ Agarum     ┆ 1         │
        │ 1999 ┆ Saccharina ┆ 4         │
        │ 2000 ┆ Agarum     ┆ null      │
        │ 2000 ┆ Saccharina ┆ 5         │
        │ 2001 ┆ Agarum     ┆ null      │
        │ 2001 ┆ Saccharina ┆ null      │
        │ 2002 ┆ Agarum     ┆ null      │
        │ 2002 ┆ Saccharina ┆ null      │
        │ 2003 ┆ Agarum     ┆ null      │
        │ 2003 ┆ Saccharina ┆ null      │
        │ 2004 ┆ Agarum     ┆ 8         │
        │ 2004 ┆ Saccharina ┆ 2         │
        └──────┴────────────┴───────────┘

        Expose missing rows per group:
        >>> df = pl.DataFrame(
        ...     {
        ...         "state": ["CA", "CA", "HI", "HI", "HI", "NY", "NY"],
        ...         "year": [2010, 2013, 2010, 2012, 2016, 2009, 2013],
        ...         "value": [1, 3, 1, 2, 3, 2, 5],
        ...     }
        ... )
        >>> df
        shape: (7, 3)
        ┌───────┬──────┬───────┐
        │ state ┆ year ┆ value │
        │ ---   ┆ ---  ┆ ---   │
        │ str   ┆ i64  ┆ i64   │
        ╞═══════╪══════╪═══════╡
        │ CA    ┆ 2010 ┆ 1     │
        │ CA    ┆ 2013 ┆ 3     │
        │ HI    ┆ 2010 ┆ 1     │
        │ HI    ┆ 2012 ┆ 2     │
        │ HI    ┆ 2016 ┆ 3     │
        │ NY    ┆ 2009 ┆ 2     │
        │ NY    ┆ 2013 ┆ 5     │
        └───────┴──────┴───────┘
        >>> low = pl.col('year').min()
        >>> high = pl.col('year').max().add(1)
        >>> new_year_values=pl.int_range(low,high).alias('year')
        >>> with pl.Config(tbl_rows=-1):
        ...     df.complete(new_year_values,by='state',sort=True)
        shape: (16, 3)
        ┌───────┬──────┬───────┐
        │ state ┆ year ┆ value │
        │ ---   ┆ ---  ┆ ---   │
        │ str   ┆ i64  ┆ i64   │
        ╞═══════╪══════╪═══════╡
        │ CA    ┆ 2010 ┆ 1     │
        │ CA    ┆ 2011 ┆ null  │
        │ CA    ┆ 2012 ┆ null  │
        │ CA    ┆ 2013 ┆ 3     │
        │ HI    ┆ 2010 ┆ 1     │
        │ HI    ┆ 2011 ┆ null  │
        │ HI    ┆ 2012 ┆ 2     │
        │ HI    ┆ 2013 ┆ null  │
        │ HI    ┆ 2014 ┆ null  │
        │ HI    ┆ 2015 ┆ null  │
        │ HI    ┆ 2016 ┆ 3     │
        │ NY    ┆ 2009 ┆ 2     │
        │ NY    ┆ 2010 ┆ null  │
        │ NY    ┆ 2011 ┆ null  │
        │ NY    ┆ 2012 ┆ null  │
        │ NY    ┆ 2013 ┆ 5     │
        └───────┴──────┴───────┘


    !!! info "New in version 0.28.0"

    Args:
        *columns: This refers to the columns to be completed.
            It can be a string or a column selector or a polars expression.
            A polars expression can be used to introduced new values,
            as long as the polars expression has a name that already exists
            in the DataFrame.
            It is up to the user to ensure that the polars expression returns
            unique values.
        fill_value: Scalar value or polars expression to use instead of nulls
            for missing combinations. A dictionary, mapping columns names
            to a scalar value is also accepted.
        explicit: Determines if only implicitly missing values
            should be filled (`False`), or all nulls existing in the LazyFrame
            (`True`). `explicit` is applicable only
            if `fill_value` is not `None`.
        sort: Sort the DataFrame based on *columns.
        by: Column(s) to group by.
            The explicit missing rows are returned per group.

    Returns:
        A polars DataFrame/LazyFrame.
    """  # noqa: E501
    return _complete(
        df=df,
        columns=columns,
        fill_value=fill_value,
        explicit=explicit,
        sort=sort,
        by=by,
    )

pivot_longer

pivot_longer implementation for polars.

pivot_longer(df, index=None, column_names=None, names_to='variable', values_to='value', names_sep=None, names_pattern=None, names_transform=None)

Unpivots a DataFrame from wide to long format.

It is modeled after the pivot_longer function in R's tidyr package, and also takes inspiration from the melt function in R's data.table package.

This function is useful to massage a DataFrame into a format where one or more columns are considered measured variables, and all other columns are considered as identifier variables.

All measured variables are unpivoted (and typically duplicated) along the row axis.

If names_pattern, use a valid regular expression pattern containing at least one capture group, compatible with the regex crate.

For more granular control on the unpivoting, have a look at pivot_longer_spec.

pivot_longer can also be applied to a LazyFrame.

Examples:

>>> import polars as pl
>>> import polars.selectors as cs
>>> import janitor.polars
>>> df = pl.DataFrame(
...     {
...         "Sepal.Length": [5.1, 5.9],
...         "Sepal.Width": [3.5, 3.0],
...         "Petal.Length": [1.4, 5.1],
...         "Petal.Width": [0.2, 1.8],
...         "Species": ["setosa", "virginica"],
...     }
... )
>>> df
shape: (2, 5)
┌──────────────┬─────────────┬──────────────┬─────────────┬───────────┐
│ Sepal.Length ┆ Sepal.Width ┆ Petal.Length ┆ Petal.Width ┆ Species   │
│ ---          ┆ ---         ┆ ---          ┆ ---         ┆ ---       │
│ f64          ┆ f64         ┆ f64          ┆ f64         ┆ str       │
╞══════════════╪═════════════╪══════════════╪═════════════╪═══════════╡
│ 5.1          ┆ 3.5         ┆ 1.4          ┆ 0.2         ┆ setosa    │
│ 5.9          ┆ 3.0         ┆ 5.1          ┆ 1.8         ┆ virginica │
└──────────────┴─────────────┴──────────────┴─────────────┴───────────┘

Replicate polars' melt:

>>> df.pivot_longer(index = 'Species').sort(by=pl.all())
shape: (8, 3)
┌───────────┬──────────────┬───────┐
│ Species   ┆ variable     ┆ value │
│ ---       ┆ ---          ┆ ---   │
│ str       ┆ str          ┆ f64   │
╞═══════════╪══════════════╪═══════╡
│ setosa    ┆ Petal.Length ┆ 1.4   │
│ setosa    ┆ Petal.Width  ┆ 0.2   │
│ setosa    ┆ Sepal.Length ┆ 5.1   │
│ setosa    ┆ Sepal.Width  ┆ 3.5   │
│ virginica ┆ Petal.Length ┆ 5.1   │
│ virginica ┆ Petal.Width  ┆ 1.8   │
│ virginica ┆ Sepal.Length ┆ 5.9   │
│ virginica ┆ Sepal.Width  ┆ 3.0   │
└───────────┴──────────────┴───────┘

Split the column labels into individual columns:

>>> df.pivot_longer(
...     index = 'Species',
...     names_to = ('part', 'dimension'),
...     names_sep = '.',
... ).select('Species','part','dimension','value').sort(by=pl.all())
shape: (8, 4)
┌───────────┬───────┬───────────┬───────┐
│ Species   ┆ part  ┆ dimension ┆ value │
│ ---       ┆ ---   ┆ ---       ┆ ---   │
│ str       ┆ str   ┆ str       ┆ f64   │
╞═══════════╪═══════╪═══════════╪═══════╡
│ setosa    ┆ Petal ┆ Length    ┆ 1.4   │
│ setosa    ┆ Petal ┆ Width     ┆ 0.2   │
│ setosa    ┆ Sepal ┆ Length    ┆ 5.1   │
│ setosa    ┆ Sepal ┆ Width     ┆ 3.5   │
│ virginica ┆ Petal ┆ Length    ┆ 5.1   │
│ virginica ┆ Petal ┆ Width     ┆ 1.8   │
│ virginica ┆ Sepal ┆ Length    ┆ 5.9   │
│ virginica ┆ Sepal ┆ Width     ┆ 3.0   │
└───────────┴───────┴───────────┴───────┘

Retain parts of the column names as headers:

>>> df.pivot_longer(
...     index = 'Species',
...     names_to = ('part', '.value'),
...     names_sep = '.',
... ).select('Species','part','Length','Width').sort(by=pl.all())
shape: (4, 4)
┌───────────┬───────┬────────┬───────┐
│ Species   ┆ part  ┆ Length ┆ Width │
│ ---       ┆ ---   ┆ ---    ┆ ---   │
│ str       ┆ str   ┆ f64    ┆ f64   │
╞═══════════╪═══════╪════════╪═══════╡
│ setosa    ┆ Petal ┆ 1.4    ┆ 0.2   │
│ setosa    ┆ Sepal ┆ 5.1    ┆ 3.5   │
│ virginica ┆ Petal ┆ 5.1    ┆ 1.8   │
│ virginica ┆ Sepal ┆ 5.9    ┆ 3.0   │
└───────────┴───────┴────────┴───────┘

Split the column labels based on regex:

>>> df = pl.DataFrame({"id": [1], "new_sp_m5564": [2], "newrel_f65": [3]})
>>> df
shape: (1, 3)
┌─────┬──────────────┬────────────┐
│ id  ┆ new_sp_m5564 ┆ newrel_f65 │
│ --- ┆ ---          ┆ ---        │
│ i64 ┆ i64          ┆ i64        │
╞═════╪══════════════╪════════════╡
│ 1   ┆ 2            ┆ 3          │
└─────┴──────────────┴────────────┘
>>> df.pivot_longer(
...     index = 'id',
...     names_to = ('diagnosis', 'gender', 'age'),
...     names_pattern = r"new_?(.+)_(.)([0-9]+)",
... ).select('id','diagnosis','gender','age','value').sort(by=pl.all())
shape: (2, 5)
┌─────┬───────────┬────────┬──────┬───────┐
│ id  ┆ diagnosis ┆ gender ┆ age  ┆ value │
│ --- ┆ ---       ┆ ---    ┆ ---  ┆ ---   │
│ i64 ┆ str       ┆ str    ┆ str  ┆ i64   │
╞═════╪═══════════╪════════╪══════╪═══════╡
│ 1   ┆ rel       ┆ f      ┆ 65   ┆ 3     │
│ 1   ┆ sp        ┆ m      ┆ 5564 ┆ 2     │
└─────┴───────────┴────────┴──────┴───────┘

Convert the dtypes of specific columns with names_transform:

>>> df.pivot_longer(
...     index = "id",
...     names_pattern=r"new_?(.+)_(.)([0-9]+)",
...     names_to=("diagnosis", "gender", "age"),
...     names_transform=pl.col('age').cast(pl.Int32),
... ).select("id", "diagnosis", "gender", "age", "value").sort(by=pl.all())
shape: (2, 5)
┌─────┬───────────┬────────┬──────┬───────┐
│ id  ┆ diagnosis ┆ gender ┆ age  ┆ value │
│ --- ┆ ---       ┆ ---    ┆ ---  ┆ ---   │
│ i64 ┆ str       ┆ str    ┆ i32  ┆ i64   │
╞═════╪═══════════╪════════╪══════╪═══════╡
│ 1   ┆ rel       ┆ f      ┆ 65   ┆ 3     │
│ 1   ┆ sp        ┆ m      ┆ 5564 ┆ 2     │
└─────┴───────────┴────────┴──────┴───────┘

Use multiple .value to reshape the dataframe:

>>> df = pl.DataFrame(
...     [
...         {
...             "x_1_mean": 10,
...             "x_2_mean": 20,
...             "y_1_mean": 30,
...             "y_2_mean": 40,
...             "unit": 50,
...         }
...     ]
... )
>>> df
shape: (1, 5)
┌──────────┬──────────┬──────────┬──────────┬──────┐
│ x_1_mean ┆ x_2_mean ┆ y_1_mean ┆ y_2_mean ┆ unit │
│ ---      ┆ ---      ┆ ---      ┆ ---      ┆ ---  │
│ i64      ┆ i64      ┆ i64      ┆ i64      ┆ i64  │
╞══════════╪══════════╪══════════╪══════════╪══════╡
│ 10       ┆ 20       ┆ 30       ┆ 40       ┆ 50   │
└──────────┴──────────┴──────────┴──────────┴──────┘
>>> df.pivot_longer(
...     index="unit",
...     names_to=(".value", "time", ".value"),
...     names_pattern=r"(x|y)_([0-9])(_mean)",
... ).select('unit','time','x_mean','y_mean').sort(by=pl.all())
shape: (2, 4)
┌──────┬──────┬────────┬────────┐
│ unit ┆ time ┆ x_mean ┆ y_mean │
│ ---  ┆ ---  ┆ ---    ┆ ---    │
│ i64  ┆ str  ┆ i64    ┆ i64    │
╞══════╪══════╪════════╪════════╡
│ 50   ┆ 1    ┆ 10     ┆ 30     │
│ 50   ┆ 2    ┆ 20     ┆ 40     │
└──────┴──────┴────────┴────────┘

New in version 0.28.0

Parameters:

Name Type Description Default
index ColumnNameOrSelector

Column(s) or selector(s) to use as identifier variables.

None
column_names ColumnNameOrSelector

Column(s) or selector(s) to unpivot.

None
names_to list | tuple | str

Name of new column as a string that will contain what were previously the column names in column_names. The default is variable if no value is provided. It can also be a list/tuple of strings that will serve as new column names, if name_sep or names_pattern is provided. If .value is in names_to, new column names will be extracted from part of the existing column names and overrides values_to.

'variable'
values_to str

Name of new column as a string that will contain what were previously the values of the columns in column_names.

'value'
names_sep str

Determines how the column name is broken up, if names_to contains multiple values. It takes the same specification as polars' str.split method.

None
names_pattern str

Determines how the column name is broken up. It can be a regular expression containing matching groups. It takes the same specification as polars' str.extract_groups method.

None
names_transform Expr

Use this option to change the types of columns that have been transformed to rows. This does not applies to the values' columns. Accepts a polars expression or a list of polars expressions. Applicable only if one of names_sep or names_pattern is provided.

None

Returns:

Type Description
DataFrame | LazyFrame

A polars DataFrame/LazyFrame that has been unpivoted

DataFrame | LazyFrame

from wide to long format.

Source code in janitor/polars/pivot_longer.py
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
@register_lazyframe_method
@register_dataframe_method
def pivot_longer(
    df: pl.DataFrame | pl.LazyFrame,
    index: ColumnNameOrSelector = None,
    column_names: ColumnNameOrSelector = None,
    names_to: list | tuple | str = "variable",
    values_to: str = "value",
    names_sep: str = None,
    names_pattern: str = None,
    names_transform: pl.Expr = None,
) -> pl.DataFrame | pl.LazyFrame:
    """
    Unpivots a DataFrame from *wide* to *long* format.

    It is modeled after the `pivot_longer` function in R's tidyr package,
    and also takes inspiration from the `melt` function in R's data.table package.

    This function is useful to massage a DataFrame into a format where
    one or more columns are considered measured variables, and all other
    columns are considered as identifier variables.

    All measured variables are *unpivoted* (and typically duplicated) along the
    row axis.

    If `names_pattern`, use a valid regular expression pattern containing at least
    one capture group, compatible with the [regex crate](https://docs.rs/regex/latest/regex/).

    For more granular control on the unpivoting, have a look at
    [`pivot_longer_spec`][janitor.polars.pivot_longer.pivot_longer_spec].

    `pivot_longer` can also be applied to a LazyFrame.

    Examples:
        >>> import polars as pl
        >>> import polars.selectors as cs
        >>> import janitor.polars
        >>> df = pl.DataFrame(
        ...     {
        ...         "Sepal.Length": [5.1, 5.9],
        ...         "Sepal.Width": [3.5, 3.0],
        ...         "Petal.Length": [1.4, 5.1],
        ...         "Petal.Width": [0.2, 1.8],
        ...         "Species": ["setosa", "virginica"],
        ...     }
        ... )
        >>> df
        shape: (2, 5)
        ┌──────────────┬─────────────┬──────────────┬─────────────┬───────────┐
        │ Sepal.Length ┆ Sepal.Width ┆ Petal.Length ┆ Petal.Width ┆ Species   │
        │ ---          ┆ ---         ┆ ---          ┆ ---         ┆ ---       │
        │ f64          ┆ f64         ┆ f64          ┆ f64         ┆ str       │
        ╞══════════════╪═════════════╪══════════════╪═════════════╪═══════════╡
        │ 5.1          ┆ 3.5         ┆ 1.4          ┆ 0.2         ┆ setosa    │
        │ 5.9          ┆ 3.0         ┆ 5.1          ┆ 1.8         ┆ virginica │
        └──────────────┴─────────────┴──────────────┴─────────────┴───────────┘

        Replicate polars' [melt](https://docs.pola.rs/py-polars/html/reference/dataframe/api/polars.DataFrame.melt.html#polars-dataframe-melt):
        >>> df.pivot_longer(index = 'Species').sort(by=pl.all())
        shape: (8, 3)
        ┌───────────┬──────────────┬───────┐
        │ Species   ┆ variable     ┆ value │
        │ ---       ┆ ---          ┆ ---   │
        │ str       ┆ str          ┆ f64   │
        ╞═══════════╪══════════════╪═══════╡
        │ setosa    ┆ Petal.Length ┆ 1.4   │
        │ setosa    ┆ Petal.Width  ┆ 0.2   │
        │ setosa    ┆ Sepal.Length ┆ 5.1   │
        │ setosa    ┆ Sepal.Width  ┆ 3.5   │
        │ virginica ┆ Petal.Length ┆ 5.1   │
        │ virginica ┆ Petal.Width  ┆ 1.8   │
        │ virginica ┆ Sepal.Length ┆ 5.9   │
        │ virginica ┆ Sepal.Width  ┆ 3.0   │
        └───────────┴──────────────┴───────┘

        Split the column labels into individual columns:
        >>> df.pivot_longer(
        ...     index = 'Species',
        ...     names_to = ('part', 'dimension'),
        ...     names_sep = '.',
        ... ).select('Species','part','dimension','value').sort(by=pl.all())
        shape: (8, 4)
        ┌───────────┬───────┬───────────┬───────┐
        │ Species   ┆ part  ┆ dimension ┆ value │
        │ ---       ┆ ---   ┆ ---       ┆ ---   │
        │ str       ┆ str   ┆ str       ┆ f64   │
        ╞═══════════╪═══════╪═══════════╪═══════╡
        │ setosa    ┆ Petal ┆ Length    ┆ 1.4   │
        │ setosa    ┆ Petal ┆ Width     ┆ 0.2   │
        │ setosa    ┆ Sepal ┆ Length    ┆ 5.1   │
        │ setosa    ┆ Sepal ┆ Width     ┆ 3.5   │
        │ virginica ┆ Petal ┆ Length    ┆ 5.1   │
        │ virginica ┆ Petal ┆ Width     ┆ 1.8   │
        │ virginica ┆ Sepal ┆ Length    ┆ 5.9   │
        │ virginica ┆ Sepal ┆ Width     ┆ 3.0   │
        └───────────┴───────┴───────────┴───────┘

        Retain parts of the column names as headers:
        >>> df.pivot_longer(
        ...     index = 'Species',
        ...     names_to = ('part', '.value'),
        ...     names_sep = '.',
        ... ).select('Species','part','Length','Width').sort(by=pl.all())
        shape: (4, 4)
        ┌───────────┬───────┬────────┬───────┐
        │ Species   ┆ part  ┆ Length ┆ Width │
        │ ---       ┆ ---   ┆ ---    ┆ ---   │
        │ str       ┆ str   ┆ f64    ┆ f64   │
        ╞═══════════╪═══════╪════════╪═══════╡
        │ setosa    ┆ Petal ┆ 1.4    ┆ 0.2   │
        │ setosa    ┆ Sepal ┆ 5.1    ┆ 3.5   │
        │ virginica ┆ Petal ┆ 5.1    ┆ 1.8   │
        │ virginica ┆ Sepal ┆ 5.9    ┆ 3.0   │
        └───────────┴───────┴────────┴───────┘

        Split the column labels based on regex:
        >>> df = pl.DataFrame({"id": [1], "new_sp_m5564": [2], "newrel_f65": [3]})
        >>> df
        shape: (1, 3)
        ┌─────┬──────────────┬────────────┐
        │ id  ┆ new_sp_m5564 ┆ newrel_f65 │
        │ --- ┆ ---          ┆ ---        │
        │ i64 ┆ i64          ┆ i64        │
        ╞═════╪══════════════╪════════════╡
        │ 1   ┆ 2            ┆ 3          │
        └─────┴──────────────┴────────────┘
        >>> df.pivot_longer(
        ...     index = 'id',
        ...     names_to = ('diagnosis', 'gender', 'age'),
        ...     names_pattern = r"new_?(.+)_(.)([0-9]+)",
        ... ).select('id','diagnosis','gender','age','value').sort(by=pl.all())
        shape: (2, 5)
        ┌─────┬───────────┬────────┬──────┬───────┐
        │ id  ┆ diagnosis ┆ gender ┆ age  ┆ value │
        │ --- ┆ ---       ┆ ---    ┆ ---  ┆ ---   │
        │ i64 ┆ str       ┆ str    ┆ str  ┆ i64   │
        ╞═════╪═══════════╪════════╪══════╪═══════╡
        │ 1   ┆ rel       ┆ f      ┆ 65   ┆ 3     │
        │ 1   ┆ sp        ┆ m      ┆ 5564 ┆ 2     │
        └─────┴───────────┴────────┴──────┴───────┘

        Convert the dtypes of specific columns with `names_transform`:
        >>> df.pivot_longer(
        ...     index = "id",
        ...     names_pattern=r"new_?(.+)_(.)([0-9]+)",
        ...     names_to=("diagnosis", "gender", "age"),
        ...     names_transform=pl.col('age').cast(pl.Int32),
        ... ).select("id", "diagnosis", "gender", "age", "value").sort(by=pl.all())
        shape: (2, 5)
        ┌─────┬───────────┬────────┬──────┬───────┐
        │ id  ┆ diagnosis ┆ gender ┆ age  ┆ value │
        │ --- ┆ ---       ┆ ---    ┆ ---  ┆ ---   │
        │ i64 ┆ str       ┆ str    ┆ i32  ┆ i64   │
        ╞═════╪═══════════╪════════╪══════╪═══════╡
        │ 1   ┆ rel       ┆ f      ┆ 65   ┆ 3     │
        │ 1   ┆ sp        ┆ m      ┆ 5564 ┆ 2     │
        └─────┴───────────┴────────┴──────┴───────┘

        Use multiple `.value` to reshape the dataframe:
        >>> df = pl.DataFrame(
        ...     [
        ...         {
        ...             "x_1_mean": 10,
        ...             "x_2_mean": 20,
        ...             "y_1_mean": 30,
        ...             "y_2_mean": 40,
        ...             "unit": 50,
        ...         }
        ...     ]
        ... )
        >>> df
        shape: (1, 5)
        ┌──────────┬──────────┬──────────┬──────────┬──────┐
        │ x_1_mean ┆ x_2_mean ┆ y_1_mean ┆ y_2_mean ┆ unit │
        │ ---      ┆ ---      ┆ ---      ┆ ---      ┆ ---  │
        │ i64      ┆ i64      ┆ i64      ┆ i64      ┆ i64  │
        ╞══════════╪══════════╪══════════╪══════════╪══════╡
        │ 10       ┆ 20       ┆ 30       ┆ 40       ┆ 50   │
        └──────────┴──────────┴──────────┴──────────┴──────┘
        >>> df.pivot_longer(
        ...     index="unit",
        ...     names_to=(".value", "time", ".value"),
        ...     names_pattern=r"(x|y)_([0-9])(_mean)",
        ... ).select('unit','time','x_mean','y_mean').sort(by=pl.all())
        shape: (2, 4)
        ┌──────┬──────┬────────┬────────┐
        │ unit ┆ time ┆ x_mean ┆ y_mean │
        │ ---  ┆ ---  ┆ ---    ┆ ---    │
        │ i64  ┆ str  ┆ i64    ┆ i64    │
        ╞══════╪══════╪════════╪════════╡
        │ 50   ┆ 1    ┆ 10     ┆ 30     │
        │ 50   ┆ 2    ┆ 20     ┆ 40     │
        └──────┴──────┴────────┴────────┘

    !!! info "New in version 0.28.0"

    Args:
        index: Column(s) or selector(s) to use as identifier variables.
        column_names: Column(s) or selector(s) to unpivot.
        names_to: Name of new column as a string that will contain
            what were previously the column names in `column_names`.
            The default is `variable` if no value is provided. It can
            also be a list/tuple of strings that will serve as new column
            names, if `name_sep` or `names_pattern` is provided.
            If `.value` is in `names_to`, new column names will be extracted
            from part of the existing column names and overrides `values_to`.
        values_to: Name of new column as a string that will contain what
            were previously the values of the columns in `column_names`.
        names_sep: Determines how the column name is broken up, if
            `names_to` contains multiple values. It takes the same
            specification as polars' `str.split` method.
        names_pattern: Determines how the column name is broken up.
            It can be a regular expression containing matching groups.
            It takes the same
            specification as polars' `str.extract_groups` method.
        names_transform: Use this option to change the types of columns that
            have been transformed to rows.
            This does not applies to the values' columns.
            Accepts a polars expression or a list of polars expressions.
            Applicable only if one of names_sep
            or names_pattern is provided.

    Returns:
        A polars DataFrame/LazyFrame that has been unpivoted
        from wide to long format.
    """  # noqa: E501
    return _pivot_longer(
        df=df,
        index=index,
        column_names=column_names,
        names_pattern=names_pattern,
        names_sep=names_sep,
        names_to=names_to,
        values_to=values_to,
        names_transform=names_transform,
    )

pivot_longer_spec(df, spec)

A declarative interface to pivot a Polars Frame from wide to long form, where you describe how the data will be unpivoted, using a DataFrame. This gives you, the user, more control over the transformation to long form, using a spec DataFrame that describes exactly how data stored in the column names becomes variables.

It can come in handy for situations where pivot_longer seems inadequate for the transformation.

New in version 0.28.0

Examples:

>>> import pandas as pd
>>> from janitor.polars import pivot_longer_spec
>>> df = pl.DataFrame(
...     {
...         "Sepal.Length": [5.1, 5.9],
...         "Sepal.Width": [3.5, 3.0],
...         "Petal.Length": [1.4, 5.1],
...         "Petal.Width": [0.2, 1.8],
...         "Species": ["setosa", "virginica"],
...     }
... )
>>> df
shape: (2, 5)
┌──────────────┬─────────────┬──────────────┬─────────────┬───────────┐
│ Sepal.Length ┆ Sepal.Width ┆ Petal.Length ┆ Petal.Width ┆ Species   │
│ ---          ┆ ---         ┆ ---          ┆ ---         ┆ ---       │
│ f64          ┆ f64         ┆ f64          ┆ f64         ┆ str       │
╞══════════════╪═════════════╪══════════════╪═════════════╪═══════════╡
│ 5.1          ┆ 3.5         ┆ 1.4          ┆ 0.2         ┆ setosa    │
│ 5.9          ┆ 3.0         ┆ 5.1          ┆ 1.8         ┆ virginica │
└──────────────┴─────────────┴──────────────┴─────────────┴───────────┘
>>> spec = {'.name':['Sepal.Length','Petal.Length',
...                  'Sepal.Width','Petal.Width'],
...         '.value':['Length','Length','Width','Width'],
...         'part':['Sepal','Petal','Sepal','Petal']}
>>> spec = pl.DataFrame(spec)
>>> spec
shape: (4, 3)
┌──────────────┬────────┬───────┐
│ .name        ┆ .value ┆ part  │
│ ---          ┆ ---    ┆ ---   │
│ str          ┆ str    ┆ str   │
╞══════════════╪════════╪═══════╡
│ Sepal.Length ┆ Length ┆ Sepal │
│ Petal.Length ┆ Length ┆ Petal │
│ Sepal.Width  ┆ Width  ┆ Sepal │
│ Petal.Width  ┆ Width  ┆ Petal │
└──────────────┴────────┴───────┘
>>> df.pipe(pivot_longer_spec,spec=spec).sort(by=pl.all())
shape: (4, 4)
┌───────────┬───────┬────────┬───────┐
│ Species   ┆ part  ┆ Length ┆ Width │
│ ---       ┆ ---   ┆ ---    ┆ ---   │
│ str       ┆ str   ┆ f64    ┆ f64   │
╞═══════════╪═══════╪════════╪═══════╡
│ setosa    ┆ Petal ┆ 1.4    ┆ 0.2   │
│ setosa    ┆ Sepal ┆ 5.1    ┆ 3.5   │
│ virginica ┆ Petal ┆ 5.1    ┆ 1.8   │
│ virginica ┆ Sepal ┆ 5.9    ┆ 3.0   │
└───────────┴───────┴────────┴───────┘

Parameters:

Name Type Description Default
df DataFrame | LazyFrame

The source DataFrame to unpivot. It can also be a LazyFrame.

required
spec DataFrame

A specification DataFrame. At a minimum, the spec DataFrame must have a .name column and a .value column. The .name column should contain the columns in the source DataFrame that will be transformed to long form. The .value column gives the name of the column that the values in the source DataFrame will go into. Additional columns in the spec DataFrame should be named to match columns in the long format of the dataset and contain values corresponding to columns pivoted from the wide format. Note that these additional columns should not already exist in the source DataFrame.

required

Raises:

Type Description
KeyError

If .name or .value is missing from the spec's columns.

ValueError

If the labels in spec['.name'] is not unique.

Returns:

Type Description
DataFrame | LazyFrame

A polars DataFrame/LazyFrame.

Source code in janitor/polars/pivot_longer.py
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
def pivot_longer_spec(
    df: pl.DataFrame | pl.LazyFrame,
    spec: pl.DataFrame,
) -> pl.DataFrame | pl.LazyFrame:
    """
    A declarative interface to pivot a Polars Frame
    from wide to long form,
    where you describe how the data will be unpivoted,
    using a DataFrame. This gives you, the user,
    more control over the transformation to long form,
    using a *spec* DataFrame that describes exactly
    how data stored in the column names
    becomes variables.

    It can come in handy for situations where
    [`pivot_longer`][janitor.polars.pivot_longer.pivot_longer]
    seems inadequate for the transformation.

    !!! info "New in version 0.28.0"

    Examples:
        >>> import pandas as pd
        >>> from janitor.polars import pivot_longer_spec
        >>> df = pl.DataFrame(
        ...     {
        ...         "Sepal.Length": [5.1, 5.9],
        ...         "Sepal.Width": [3.5, 3.0],
        ...         "Petal.Length": [1.4, 5.1],
        ...         "Petal.Width": [0.2, 1.8],
        ...         "Species": ["setosa", "virginica"],
        ...     }
        ... )
        >>> df
        shape: (2, 5)
        ┌──────────────┬─────────────┬──────────────┬─────────────┬───────────┐
        │ Sepal.Length ┆ Sepal.Width ┆ Petal.Length ┆ Petal.Width ┆ Species   │
        │ ---          ┆ ---         ┆ ---          ┆ ---         ┆ ---       │
        │ f64          ┆ f64         ┆ f64          ┆ f64         ┆ str       │
        ╞══════════════╪═════════════╪══════════════╪═════════════╪═══════════╡
        │ 5.1          ┆ 3.5         ┆ 1.4          ┆ 0.2         ┆ setosa    │
        │ 5.9          ┆ 3.0         ┆ 5.1          ┆ 1.8         ┆ virginica │
        └──────────────┴─────────────┴──────────────┴─────────────┴───────────┘
        >>> spec = {'.name':['Sepal.Length','Petal.Length',
        ...                  'Sepal.Width','Petal.Width'],
        ...         '.value':['Length','Length','Width','Width'],
        ...         'part':['Sepal','Petal','Sepal','Petal']}
        >>> spec = pl.DataFrame(spec)
        >>> spec
        shape: (4, 3)
        ┌──────────────┬────────┬───────┐
        │ .name        ┆ .value ┆ part  │
        │ ---          ┆ ---    ┆ ---   │
        │ str          ┆ str    ┆ str   │
        ╞══════════════╪════════╪═══════╡
        │ Sepal.Length ┆ Length ┆ Sepal │
        │ Petal.Length ┆ Length ┆ Petal │
        │ Sepal.Width  ┆ Width  ┆ Sepal │
        │ Petal.Width  ┆ Width  ┆ Petal │
        └──────────────┴────────┴───────┘
        >>> df.pipe(pivot_longer_spec,spec=spec).sort(by=pl.all())
        shape: (4, 4)
        ┌───────────┬───────┬────────┬───────┐
        │ Species   ┆ part  ┆ Length ┆ Width │
        │ ---       ┆ ---   ┆ ---    ┆ ---   │
        │ str       ┆ str   ┆ f64    ┆ f64   │
        ╞═══════════╪═══════╪════════╪═══════╡
        │ setosa    ┆ Petal ┆ 1.4    ┆ 0.2   │
        │ setosa    ┆ Sepal ┆ 5.1    ┆ 3.5   │
        │ virginica ┆ Petal ┆ 5.1    ┆ 1.8   │
        │ virginica ┆ Sepal ┆ 5.9    ┆ 3.0   │
        └───────────┴───────┴────────┴───────┘

    Args:
        df: The source DataFrame to unpivot.
            It can also be a LazyFrame.
        spec: A specification DataFrame.
            At a minimum, the spec DataFrame
            must have a `.name` column
            and a `.value` column.
            The `.name` column  should contain the
            columns in the source DataFrame that will be
            transformed to long form.
            The `.value` column gives the name of the column
            that the values in the source DataFrame will go into.
            Additional columns in the spec DataFrame
            should be named to match columns
            in the long format of the dataset and contain values
            corresponding to columns pivoted from the wide format.
            Note that these additional columns should not already exist
            in the source DataFrame.

    Raises:
        KeyError: If `.name` or `.value` is missing from the spec's columns.
        ValueError: If the labels in `spec['.name']` is not unique.

    Returns:
        A polars DataFrame/LazyFrame.
    """
    check("spec", spec, [pl.DataFrame])
    if ".name" not in spec.columns:
        raise KeyError(
            "Kindly ensure the spec DataFrame has a `.name` column."
        )
    if ".value" not in spec.columns:
        raise KeyError(
            "Kindly ensure the spec DataFrame has a `.value` column."
        )
    if spec.select(pl.col(".name").is_duplicated().any()).item():
        raise ValueError("The labels in the `.name` column should be unique.")

    exclude = set(df.columns).intersection(spec.columns)
    if exclude:
        raise ValueError(
            f"Labels {*exclude, } in the spec dataframe already exist "
            "as column labels in the source dataframe. "
            "Kindly ensure the spec DataFrame's columns "
            "are not present in the source DataFrame."
        )
    index = [
        label for label in df.columns if label not in spec.get_column(".name")
    ]
    others = [
        label for label in spec.columns if label not in {".name", ".value"}
    ]
    variable_name = "".join(df.columns + spec.columns)
    variable_name = f"{variable_name}_"
    if others:
        dot_value_only = False
        expression = pl.struct(others).alias(variable_name)
        spec = spec.select(".name", ".value", expression)
    else:
        dot_value_only = True
        expression = pl.cum_count(".value").over(".value").alias(variable_name)
        spec = spec.with_columns(expression)
    return _pivot_longer_dot_value(
        df=df,
        index=index,
        spec=spec,
        variable_name=variable_name,
        dot_value_only=dot_value_only,
        names_transform=None,
    )

row_to_names

row_to_names implementation for polars.

row_to_names(df, row_numbers=0, remove_rows=False, remove_rows_above=False, separator='_')

Elevates a row, or rows, to be the column names of a DataFrame.

row_to_names can also be applied to a LazyFrame.

Examples:

Replace column names with the first row.

>>> import polars as pl
>>> import janitor.polars
>>> df = pl.DataFrame({
...     "a": ["nums", '6', '9'],
...     "b": ["chars", "x", "y"],
... })
>>> df
shape: (3, 2)
┌──────┬───────┐
│ a    ┆ b     │
│ ---  ┆ ---   │
│ str  ┆ str   │
╞══════╪═══════╡
│ nums ┆ chars │
│ 6    ┆ x     │
│ 9    ┆ y     │
└──────┴───────┘
>>> df.row_to_names(0, remove_rows=True)
shape: (2, 2)
┌──────┬───────┐
│ nums ┆ chars │
│ ---  ┆ ---   │
│ str  ┆ str   │
╞══════╪═══════╡
│ 6    ┆ x     │
│ 9    ┆ y     │
└──────┴───────┘
>>> df.row_to_names(row_numbers=[0,1], remove_rows=True)
shape: (1, 2)
┌────────┬─────────┐
│ nums_6 ┆ chars_x │
│ ---    ┆ ---     │
│ str    ┆ str     │
╞════════╪═════════╡
│ 9      ┆ y       │
└────────┴─────────┘

Remove rows above the elevated row and the elevated row itself.

>>> df = pl.DataFrame({
...     "a": ["bla1", "nums", '6', '9'],
...     "b": ["bla2", "chars", "x", "y"],
... })
>>> df
shape: (4, 2)
┌──────┬───────┐
│ a    ┆ b     │
│ ---  ┆ ---   │
│ str  ┆ str   │
╞══════╪═══════╡
│ bla1 ┆ bla2  │
│ nums ┆ chars │
│ 6    ┆ x     │
│ 9    ┆ y     │
└──────┴───────┘
>>> df.row_to_names(1, remove_rows=True, remove_rows_above=True)
shape: (2, 2)
┌──────┬───────┐
│ nums ┆ chars │
│ ---  ┆ ---   │
│ str  ┆ str   │
╞══════╪═══════╡
│ 6    ┆ x     │
│ 9    ┆ y     │
└──────┴───────┘

New in version 0.28.0

Parameters:

Name Type Description Default
row_numbers int | list

Position of the row(s) containing the variable names. Note that indexing starts from 0. It can also be a list. Defaults to 0 (first row).

0
remove_rows bool

Whether the row(s) should be removed from the DataFrame.

False
remove_rows_above bool

Whether the row(s) above the selected row should be removed from the DataFrame.

False
separator str

Combines the labels into a single string, if row_numbers is a list of integers. Default is '_'.

'_'

Returns:

Type Description
DataFrame | LazyFrame

A polars DataFrame/LazyFrame.

Source code in janitor/polars/row_to_names.py
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
@register_lazyframe_method
@register_dataframe_method
def row_to_names(
    df: pl.DataFrame | pl.LazyFrame,
    row_numbers: int | list = 0,
    remove_rows: bool = False,
    remove_rows_above: bool = False,
    separator: str = "_",
) -> pl.DataFrame | pl.LazyFrame:
    """
    Elevates a row, or rows, to be the column names of a DataFrame.

    `row_to_names` can also be applied to a LazyFrame.

    Examples:
        Replace column names with the first row.

        >>> import polars as pl
        >>> import janitor.polars
        >>> df = pl.DataFrame({
        ...     "a": ["nums", '6', '9'],
        ...     "b": ["chars", "x", "y"],
        ... })
        >>> df
        shape: (3, 2)
        ┌──────┬───────┐
        │ a    ┆ b     │
        │ ---  ┆ ---   │
        │ str  ┆ str   │
        ╞══════╪═══════╡
        │ nums ┆ chars │
        │ 6    ┆ x     │
        │ 9    ┆ y     │
        └──────┴───────┘
        >>> df.row_to_names(0, remove_rows=True)
        shape: (2, 2)
        ┌──────┬───────┐
        │ nums ┆ chars │
        │ ---  ┆ ---   │
        │ str  ┆ str   │
        ╞══════╪═══════╡
        │ 6    ┆ x     │
        │ 9    ┆ y     │
        └──────┴───────┘
        >>> df.row_to_names(row_numbers=[0,1], remove_rows=True)
        shape: (1, 2)
        ┌────────┬─────────┐
        │ nums_6 ┆ chars_x │
        │ ---    ┆ ---     │
        │ str    ┆ str     │
        ╞════════╪═════════╡
        │ 9      ┆ y       │
        └────────┴─────────┘

        Remove rows above the elevated row and the elevated row itself.

        >>> df = pl.DataFrame({
        ...     "a": ["bla1", "nums", '6', '9'],
        ...     "b": ["bla2", "chars", "x", "y"],
        ... })
        >>> df
        shape: (4, 2)
        ┌──────┬───────┐
        │ a    ┆ b     │
        │ ---  ┆ ---   │
        │ str  ┆ str   │
        ╞══════╪═══════╡
        │ bla1 ┆ bla2  │
        │ nums ┆ chars │
        │ 6    ┆ x     │
        │ 9    ┆ y     │
        └──────┴───────┘
        >>> df.row_to_names(1, remove_rows=True, remove_rows_above=True)
        shape: (2, 2)
        ┌──────┬───────┐
        │ nums ┆ chars │
        │ ---  ┆ ---   │
        │ str  ┆ str   │
        ╞══════╪═══════╡
        │ 6    ┆ x     │
        │ 9    ┆ y     │
        └──────┴───────┘

    !!! info "New in version 0.28.0"

    Args:
        row_numbers: Position of the row(s) containing the variable names.
            Note that indexing starts from 0. It can also be a list.
            Defaults to 0 (first row).
        remove_rows: Whether the row(s) should be removed from the DataFrame.
        remove_rows_above: Whether the row(s) above the selected row should
            be removed from the DataFrame.
        separator: Combines the labels into a single string,
            if row_numbers is a list of integers. Default is '_'.

    Returns:
        A polars DataFrame/LazyFrame.
    """  # noqa: E501
    return _row_to_names(
        df=df,
        row_numbers=row_numbers,
        remove_rows=remove_rows,
        remove_rows_above=remove_rows_above,
        separator=separator,
    )