Skip to content
kd
7 Jun 2025
Back to blog

ibis - a better pandas

4 min read (635 words)

Table of contents

Code
from IPython.core.interactiveshell import InteractiveShell
# `ast_node_interactivity` is a setting that determines how the return value of the last line in a cell is displayed
# with `last_expr_or_assign`, the return value of the last expression is displayed unless it is assigned to a variable
InteractiveShell.ast_node_interactivity = "last_expr_or_assign"

There's an excellent blog post on why Pandas feels clunky for those coming from R.

However in Python, I've found ibis as an alternative to pandas to be a much more natural fit for those coming from R.

ibis uses duckdb as a backend by default, and its API is a mix between duckdb and dplyr.

import ibis

_ in ibis is a special variable that refers to the last expression evaluated this is useful for chaining operations or for using the result of the last expression in subsequent operations

from ibis import _

By default, ibis defers execution until you call execute(). Using ibis.options.interactive = True will make it so that expressions are immediately executed when displayed. This is useful for interactive exploration.

ibis.options.interactive = True

Let's also import pandas to compare the two libraries.

import pandas as pd

Here's the equivalent code in pandas and ibis for the example provided in the blog post:

pandas_df = pd.read_csv("purchases.csv")
pandas_df.head()
countryamountdiscount
0USA200010
1USA350015
2USA300020
3Canada12012
4Canada18018
df = ibis.read_csv("purchases.csv")
df.head()
┏━━━━━━━━━┳━━━━━━━━┳━━━━━━━━━━┓
┃ country  amount  discount ┃
┡━━━━━━━━━╇━━━━━━━━╇━━━━━━━━━━┩
│ stringint64int64    │
├─────────┼────────┼──────────┤
│ USA    200010 │
│ USA    350015 │
│ USA    300020 │
│ Canada 12012 │
│ Canada 18018 │
└─────────┴────────┴──────────┘

"How much do we sell..? Let's take the total sum!"

pandas

pandas_df.amount.sum()
np.int64(17210)

ibis

df.amount.sum().execute()
17210

"Ah, they wanted it by country..."

pandas

(
pandas_df
.groupby("country")
.agg(total=("amount", "sum"))
.reset_index()
)
countrytotal
0Australia600
1Brazil460
2Canada3400
3France500
4Germany570
5India720
6Italy630
7Japan690
8Spain660
9UK480
10USA8500

ibis

(
df
.group_by("country")
.aggregate(total=_.amount.sum())
.order_by("country") # optional, to align with the pandas output
)
┏━━━━━━━━━━━┳━━━━━━━┓
┃ country    total ┃
┡━━━━━━━━━━━╇━━━━━━━┩
│ stringint64 │
├───────────┼───────┤
│ Australia600 │
│ Brazil   460 │
│ Canada   3400 │
│ France   500 │
│ Germany  570 │
│ India    720 │
│ Italy    630 │
│ Japan    690 │
│ Spain    660 │
│ UK       480 │
│  │
└───────────┴───────┘

Calling .execute() will run the query and return the result as a pandas DataFrame.

type(
(
df
.group_by("country")
.aggregate(total=_.amount.sum())
.order_by("country") # optional, to align with the pandas output
).execute()
)
pandas.core.frame.DataFrame

"And I guess I should deduct the discount."

pandas

(
pandas_df
.groupby("country")[["amount", "discount"]]
.apply(lambda df: (df["amount"] - df["discount"]).sum())
.reset_index()
.rename(columns={0: "total"})
)
countrytotal
0Australia540
1Brazil414
2Canada3349
3France450
4Germany513
5India648
6Italy567
7Japan621
8Spain594
9UK432
10USA8455

ibis

(
df
.group_by("country")
.aggregate(total=(_.amount - _.discount).sum())
.order_by("country")
)
┏━━━━━━━━━━━┳━━━━━━━┓
┃ country    total ┃
┡━━━━━━━━━━━╇━━━━━━━┩
│ stringint64 │
├───────────┼───────┤
│ Australia540 │
│ Brazil   414 │
│ Canada   3349 │
│ France   450 │
│ Germany  513 │
│ India    648 │
│ Italy    567 │
│ Japan    621 │
│ Spain    594 │
│ UK       432 │
│  │
└───────────┴───────┘

"Oh, and Maria asked me to remove any outliers."

pandas

(
pandas_df
.query("amount <= amount.median() * 10")
.groupby("country")[["amount", "discount"]]
.apply(lambda df: (df["amount"] - df["discount"]).sum())
.reset_index()
.rename(columns={0: "total"})
)
countrytotal
0Australia540
1Brazil414
2Canada270
3France450
4Germany513
5India648
6Italy567
7Japan621
8Spain594
9UK432
10USA1990

ibis

(
df
.mutate(median=_.amount.median())
.filter(_.amount <= _.median * 10)
.group_by("country")
.aggregate(total=(_.amount - _.discount).sum())
.order_by("country")
)
┏━━━━━━━━━━━┳━━━━━━━┓
┃ country    total ┃
┡━━━━━━━━━━━╇━━━━━━━┩
│ stringint64 │
├───────────┼───────┤
│ Australia540 │
│ Brazil   414 │
│ Canada   270 │
│ France   450 │
│ Germany  513 │
│ India    648 │
│ Italy    567 │
│ Japan    621 │
│ Spain    594 │
│ UK       432 │
│  │
└───────────┴───────┘

"I probably should use the median within each country"

pandas

(
pandas_df
.assign(country_median=lambda df:
df.groupby("country")["amount"].transform("median")
)
.query("amount <= country_median * 10")
.groupby("country")[["amount", "discount"]]
.apply(lambda df: (df["amount"] - df["discount"]).sum())
.reset_index()
.rename(columns={0: "total"})
)
countrytotal
0Australia540
1Brazil414
2Canada270
3France450
4Germany513
5India648
6Italy567
7Japan621
8Spain594
9UK432
10USA8455

ibis

For this last example, we have to resort to calculating the median after a group by operation over each country and then join it back to the original DataFrame to replace the outliers. This is similar to the pandas approach.

df.group_by("country").aggregate(median=_.amount.median())
┏━━━━━━━━━━━┳━━━━━━━━━┓
┃ country    median  ┃
┡━━━━━━━━━━━╇━━━━━━━━━┩
│ stringfloat64 │
├───────────┼─────────┤
│ USA      3000.0 │
│ Germany  200.0 │
│ India    250.0 │
│ Spain    230.0 │
│ Japan    240.0 │
│ Italy    220.0 │
│ Brazil   230.0 │
│ Australia210.0 │
│ UK       160.0 │
│ Canada   180.0 │
│  │
└───────────┴─────────┘

Here's how you can do it in a single expression in ibis:

(
df
.join(
df.group_by("country").aggregate(median=_.amount.median()),
predicates=["country"],
)
.filter(_.amount <= _.median * 10)
.group_by("country")
.aggregate(total=(_.amount - _.discount).sum())
.order_by("country")
)
┏━━━━━━━━━━━┳━━━━━━━┓
┃ country    total ┃
┡━━━━━━━━━━━╇━━━━━━━┩
│ stringint64 │
├───────────┼───────┤
│ Australia540 │
│ Brazil   414 │
│ Canada   270 │
│ France   450 │
│ Germany  513 │
│ India    648 │
│ Italy    567 │
│ Japan    621 │
│ Spain    594 │
│ UK       432 │
│  │
└───────────┴───────┘

Citation

@online{krishnamurthy2025ibisbetterpandas,
  author = {Dheepak Krishnamurthy},
  title = {ibis - a better pandas},
  year = {2025},
  date = {2025-06-07},
  url = {https://kdheepak.com/blog/ibis-better-pandas/},
  langid = {en},
}

For attribution, please cite this work as:

Dheepak Krishnamurthy, "ibis - a better pandas", June 7, 2025 https://kdheepak.com/blog/ibis-better-pandas/


Building Dashboards using Param and Panel in Python