Skip to content
kd
7 Jun 2025
Back to blog

ibis - a better pandas

4 min read (635 words)

Table of contents

Code
from IPython.core.interactiveshell import InteractiveShell
# `ast_node_interactivity` is a setting that determines how the return value of the last line in a cell is displayed
# with `last_expr_or_assign`, the return value of the last expression is displayed unless it is assigned to a variable
InteractiveShell.ast_node_interactivity = "last_expr_or_assign"

There's an excellent blog post on why Pandas feels clunky for those coming from R.

However in Python, I've found ibis as an alternative to pandas to be a much more natural fit for those coming from R.

ibis uses duckdb as a backend by default, and its API is a mix between duckdb and dplyr.

import ibis

_ in ibis is a special variable that refers to the last expression evaluated this is useful for chaining operations or for using the result of the last expression in subsequent operations

from ibis import _

By default, ibis defers execution until you call execute(). Using ibis.options.interactive = True will make it so that expressions are immediately executed when displayed. This is useful for interactive exploration.

ibis.options.interactive = True

Let's also import pandas to compare the two libraries.

import pandas as pd

Here's the equivalent code in pandas and ibis for the example provided in the blog post:

pandas_df = pd.read_csv("purchases.csv")
pandas_df.head()

:::div{.cell-output .cell-output-display} | | country | amount | discount | | --- | --- | --- | --- | | 0 | USA | 2000 | 10 | | 1 | USA | 3500 | 15 | | 2 | USA | 3000 | 20 | | 3 | Canada | 120 | 12 | | 4 | Canada | 180 | 18 | ::: ::::

df = ibis.read_csv("purchases.csv")
df.head()
┏━━━━━━━━━┳━━━━━━━━┳━━━━━━━━━━┓
┃ country  amount  discount ┃
┡━━━━━━━━━╇━━━━━━━━╇━━━━━━━━━━┩
│ stringint64int64    │
├─────────┼────────┼──────────┤
│ USA    200010 │
│ USA    350015 │
│ USA    300020 │
│ Canada 12012 │
│ Canada 18018 │
└─────────┴────────┴──────────┘

::: ::::

"How much do we sell..? Let's take the total sum!"

pandas

pandas_df.amount.sum()

:::div{.cell-output .cell-output-display} np.int64(17210) ::: ::::

ibis

df.amount.sum().execute()

:::div{.cell-output .cell-output-display} 17210 ::: ::::

"Ah, they wanted it by country..."

pandas

(
pandas_df
.groupby("country")
.agg(total=("amount", "sum"))
.reset_index()
)

:::div{.cell-output .cell-output-display} | | country | total | | --- | --- | --- | | 0 | Australia | 600 | | 1 | Brazil | 460 | | 2 | Canada | 3400 | | 3 | France | 500 | | 4 | Germany | 570 | | 5 | India | 720 | | 6 | Italy | 630 | | 7 | Japan | 690 | | 8 | Spain | 660 | | 9 | UK | 480 | | 10 | USA | 8500 | :::

ibis

(
df
.group_by("country")
.aggregate(total=_.amount.sum())
.order_by("country") # optional, to align with the pandas output
)
┏━━━━━━━━━━━┳━━━━━━━┓
┃ country    total ┃
┡━━━━━━━━━━━╇━━━━━━━┩
│ stringint64 │
├───────────┼───────┤
│ Australia600 │
│ Brazil   460 │
│ Canada   3400 │
│ France   500 │
│ Germany  570 │
│ India    720 │
│ Italy    630 │
│ Japan    690 │
│ Spain    660 │
│ UK       480 │
│  │
└───────────┴───────┘

::: ::::

Calling .execute() will run the query and return the result as a pandas DataFrame.

type(
(
df
.group_by("country")
.aggregate(total=_.amount.sum())
.order_by("country") # optional, to align with the pandas output
).execute()
)

:::div{.cell-output .cell-output-display} pandas.core.frame.DataFrame ::: ::::

"And I guess I should deduct the discount."

pandas

(
pandas_df
.groupby("country")[["amount", "discount"]]
.apply(lambda df: (df["amount"] - df["discount"]).sum())
.reset_index()
.rename(columns={0: "total"})
)

:::div{.cell-output .cell-output-display} | | country | total | | --- | --- | --- | | 0 | Australia | 540 | | 1 | Brazil | 414 | | 2 | Canada | 3349 | | 3 | France | 450 | | 4 | Germany | 513 | | 5 | India | 648 | | 6 | Italy | 567 | | 7 | Japan | 621 | | 8 | Spain | 594 | | 9 | UK | 432 | | 10 | USA | 8455 | :::

ibis

(
df
.group_by("country")
.aggregate(total=(_.amount - _.discount).sum())
.order_by("country")
)
┏━━━━━━━━━━━┳━━━━━━━┓
┃ country    total ┃
┡━━━━━━━━━━━╇━━━━━━━┩
│ stringint64 │
├───────────┼───────┤
│ Australia540 │
│ Brazil   414 │
│ Canada   3349 │
│ France   450 │
│ Germany  513 │
│ India    648 │
│ Italy    567 │
│ Japan    621 │
│ Spain    594 │
│ UK       432 │
│  │
└───────────┴───────┘

::: ::::

"Oh, and Maria asked me to remove any outliers."

pandas

(
pandas_df
.query("amount <= amount.median() * 10")
.groupby("country")[["amount", "discount"]]
.apply(lambda df: (df["amount"] - df["discount"]).sum())
.reset_index()
.rename(columns={0: "total"})
)

:::div{.cell-output .cell-output-display} | | country | total | | --- | --- | --- | | 0 | Australia | 540 | | 1 | Brazil | 414 | | 2 | Canada | 270 | | 3 | France | 450 | | 4 | Germany | 513 | | 5 | India | 648 | | 6 | Italy | 567 | | 7 | Japan | 621 | | 8 | Spain | 594 | | 9 | UK | 432 | | 10 | USA | 1990 | :::

ibis

(
df
.mutate(median=_.amount.median())
.filter(_.amount <= _.median * 10)
.group_by("country")
.aggregate(total=(_.amount - _.discount).sum())
.order_by("country")
)
┏━━━━━━━━━━━┳━━━━━━━┓
┃ country    total ┃
┡━━━━━━━━━━━╇━━━━━━━┩
│ stringint64 │
├───────────┼───────┤
│ Australia540 │
│ Brazil   414 │
│ Canada   270 │
│ France   450 │
│ Germany  513 │
│ India    648 │
│ Italy    567 │
│ Japan    621 │
│ Spain    594 │
│ UK       432 │
│  │
└───────────┴───────┘

::: ::::

"I probably should use the median within each country"

pandas

(
pandas_df
.assign(country_median=lambda df:
df.groupby("country")["amount"].transform("median")
)
.query("amount <= country_median * 10")
.groupby("country")[["amount", "discount"]]
.apply(lambda df: (df["amount"] - df["discount"]).sum())
.reset_index()
.rename(columns={0: "total"})
)

:::div{.cell-output .cell-output-display} | | country | total | | --- | --- | --- | | 0 | Australia | 540 | | 1 | Brazil | 414 | | 2 | Canada | 270 | | 3 | France | 450 | | 4 | Germany | 513 | | 5 | India | 648 | | 6 | Italy | 567 | | 7 | Japan | 621 | | 8 | Spain | 594 | | 9 | UK | 432 | | 10 | USA | 8455 | :::

ibis

For this last example, we have to resort to calculating the median after a group by operation over each country and then join it back to the original DataFrame to replace the outliers. This is similar to the pandas approach.

df.group_by("country").aggregate(median=_.amount.median())
┏━━━━━━━━━━━┳━━━━━━━━━┓
┃ country    median  ┃
┡━━━━━━━━━━━╇━━━━━━━━━┩
│ stringfloat64 │
├───────────┼─────────┤
│ USA      3000.0 │
│ Germany  200.0 │
│ India    250.0 │
│ Spain    230.0 │
│ Japan    240.0 │
│ Italy    220.0 │
│ Brazil   230.0 │
│ Australia210.0 │
│ UK       160.0 │
│ Canada   180.0 │
│  │
└───────────┴─────────┘

::: ::::

Here's how you can do it in a single expression in ibis:

(
df
.join(
df.group_by("country").aggregate(median=_.amount.median()),
predicates=["country"],
)
.filter(_.amount <= _.median * 10)
.group_by("country")
.aggregate(total=(_.amount - _.discount).sum())
.order_by("country")
)
┏━━━━━━━━━━━┳━━━━━━━┓
┃ country    total ┃
┡━━━━━━━━━━━╇━━━━━━━┩
│ stringint64 │
├───────────┼───────┤
│ Australia540 │
│ Brazil   414 │
│ Canada   270 │
│ France   450 │
│ Germany  513 │
│ India    648 │
│ Italy    567 │
│ Japan    621 │
│ Spain    594 │
│ UK       432 │
│  │
└───────────┴───────┘

::: ::::


Citation

@online{krishnamurthy2025ibisbetterpandas,
  author = {Dheepak Krishnamurthy},
  title = {ibis - a better pandas},
  year = {2025},
  date = {2025-06-07},
  url = {https://kdheepak.com/blog/ibis-better-pandas/},
  langid = {en},
}

For attribution, please cite this work as:

Dheepak Krishnamurthy, "ibis - a better pandas", June 7, 2025 https://kdheepak.com/blog/ibis-better-pandas/


Building Dashboards using Param and Panel in Python