Show the code
import pandas as pd
import numpy as np
import json
from lets_plot import *
from lets_plot.mapping import *
LetsPlot.setup_html(isolated_frame=True)Course DS 250
Markuss Saule
def coerce_numeric(s: pd.Series) -> pd.Series:
missing_tokens = {"", " ", "NA", "N/A", "NULL", "null", "None", "none", "nan", "NaN"}
s = s.apply(lambda x: np.nan if (isinstance(x, str) and x.strip() in missing_tokens) else x)
s = s.replace([-999, -999.0, "-999", "-999.0"], np.nan)
def strip_plus(x):
if isinstance(x, str):
x = x.strip()
return x[:-1] if x.endswith("+") else x
return x
s = s.apply(strip_plus)
return pd.to_numeric(s, errors="coerce")
# 1) Convert numeric columns
id_cols = ["airport_code", "airport_name", "month"]
num_cols = [c for c in df_raw.columns if c not in id_cols]
df = df_raw.copy()
for c in num_cols:
df[c] = coerce_numeric(df[c])
# 2) Clean + standardize month
df["month"] = (
df["month"]
.astype("string")
.str.strip()
.str.replace(r"\s+", " ", regex=True)
.str.lower()
)
# normalize known missing tokens
df.loc[df["month"].isin(["n/a", "na", "null", "none", "nan", ""]), "month"] = pd.NA
# fix dataset-specific spelling issue BEFORE title-casing
df["month"] = df["month"].replace({"febuary": "february"})
# final casing to match month_order
df["month"] = df["month"].str.title()
df["airport_name"] = (
df["airport_name"]
.astype("string")
.str.strip()
.str.replace(r"\s+", " ", regex=True)
)
df.loc[df["airport_name"].str.lower().isin(["n/a", "na", "null", "none", "nan", ""]), "airport_name"] = pd.NA
# build code -> name map from any non-missing airport_name values
code_to_name = (
df.dropna(subset=["airport_name"])
.drop_duplicates(subset=["airport_code", "airport_name"])
.groupby("airport_code")["airport_name"]
.first()
.to_dict()
)
# fallback for anything still missing (main one is IAD)
fallback_names = {
"IAD": "Washington, DC: Washington Dulles International"
}
df["airport_name"] = df["airport_name"].fillna(df["airport_code"].map(code_to_name))
df["airport_name"] = df["airport_name"].fillna(df["airport_code"].map(fallback_names))# -----------------------------
# Build summaries used in the report + elevator pitch
# -----------------------------
# Airport summary (Task 2)
airport_summary = (
df.groupby("airport_code", as_index=False)
.agg(
airport_name=("airport_name", "first"),
total_flights=("num_of_flights_total", "sum"),
delayed_flights=("num_of_delays_total", "sum"),
total_delay_minutes=("minutes_delayed_total", "sum"),
)
)
airport_summary["prop_delayed"] = airport_summary["delayed_flights"] / airport_summary["total_flights"]
airport_summary["avg_delay_hours_per_delayed_flight"] = (airport_summary["total_delay_minutes"] / airport_summary["delayed_flights"]) / 60
airport_summary["expected_delay_hours_per_flight"] = (airport_summary["total_delay_minutes"] / airport_summary["total_flights"]) / 60
worst_airport = airport_summary.sort_values("expected_delay_hours_per_flight", ascending=False).iloc[0]
best_airport = airport_summary.sort_values("expected_delay_hours_per_flight", ascending=True).iloc[0]
# Month summary (Task 3)
month_order = ["January","February","March","April","May","June","July","August","September","October","November","December"]
df_month = df.dropna(subset=["month"]).copy()
df_month["month"] = df_month["month"].astype(str)
month_summary = (
df_month.groupby("month", as_index=False, observed=False)
.agg(
total_flights=("num_of_flights_total", "sum"),
delayed_flights=("num_of_delays_total", "sum"),
total_delay_minutes=("minutes_delayed_total", "sum"),
)
)
month_summary["prop_delayed"] = np.where(
month_summary["total_flights"] > 0,
month_summary["delayed_flights"] / month_summary["total_flights"],
np.nan
)
valid_months = month_summary.dropna(subset=["prop_delayed"]).copy()
best_month = valid_months.sort_values("prop_delayed", ascending=True).iloc[0]
worst_month = valid_months.sort_values("prop_delayed", ascending=False).iloc[0]Across the 10-year window and 7 airports, delay performance isn’t evenly distributed. The worst airport by expected delay burden per flight adds about 0.271 hours of delay per flight on average, while the best airport is closer to 0.120 hours.
Seasonality matters too: the lowest “any-delay” risk shows up in September (16.45% of flights delayed), while December is the roughest (25.68%). If your goal is simply avoid delays, the month you fly is one of the easiest levers you can pull.
Fix all of the varied missing data types in the data to be consistent (all missing values should be displayed as “NaN”).
In your report include one record example (one row) from your new data, in the raw JSON format. Your example should display the “NaN” for at least one missing value.
This dataset contains multiple “missing” encodings (numeric sentinels like -999, blank-like strings, and nulls). That mix is risky because summaries can silently treat “missing” as a real value and distort totals and averages. I standardized all missing variants to NaN and converted numeric-looking strings (like 1500+) into valid numeric values, so every downstream calculation behaves consistently.
{
"airport_code": "ATL",
"airport_name": "Atlanta, GA: Hartsfield-Jackson Atlanta International",
"month": "January",
"year": 2005.0,
"num_of_flights_total": 35048,
"num_of_delays_carrier": 1500,
"num_of_delays_late_aircraft": "NaN",
"num_of_delays_nas": 4598,
"num_of_delays_security": 10,
"num_of_delays_weather": 448,
"num_of_delays_total": 8355,
"minutes_delayed_carrier": 116423.0,
"minutes_delayed_late_aircraft": 104415,
"minutes_delayed_nas": 207467.0,
"minutes_delayed_security": 297,
"minutes_delayed_weather": 36931,
"minutes_delayed_total": 465533
}
Which airport has the worst delays? Describe the metric you chose, and why you chose it to determine the “worst” airport.
Include a summary table with: total flights, total delayed flights, proportion delayed, and average delay time in hours.
A delay rate alone can be misleading (an airport might delay fewer flights but for much longer). To reflect both frequency and severity, I used:
(total delay minutes / total flights) / 60This is a practical “time tax” metric: on average, how many hours of delay does a traveler absorb per flight when routing through each airport?
airport_table = (
airport_summary
.sort_values("expected_delay_hours_per_flight", ascending=False)
.assign(
prop_delayed_pct=lambda d: (d["prop_delayed"] * 100).round(2),
avg_delay_hours_per_delayed_flight=lambda d: d["avg_delay_hours_per_delayed_flight"].round(3),
expected_delay_hours_per_flight=lambda d: d["expected_delay_hours_per_flight"].round(4),
)
[["airport_code", "airport_name", "total_flights", "delayed_flights",
"prop_delayed_pct", "avg_delay_hours_per_delayed_flight", "expected_delay_hours_per_flight"]]
)
airport_table| airport_code | airport_name | total_flights | delayed_flights | prop_delayed_pct | avg_delay_hours_per_delayed_flight | expected_delay_hours_per_flight | |
|---|---|---|---|---|---|---|---|
| 5 | SFO | San Francisco, CA: San Francisco International | 1630945 | 425604 | 26.10 | 1.040 | 0.2713 |
| 3 | ORD | Chicago, IL: Chicago O'Hare International | 3597588 | 830825 | 23.09 | 1.131 | 0.2611 |
| 0 | ATL | Atlanta, GA: Hartsfield-Jackson Atlanta Intern... | 4430047 | 902443 | 20.37 | 0.997 | 0.2031 |
| 2 | IAD | Washington, DC: Washington Dulles International | 851571 | 168467 | 19.78 | 1.017 | 0.2013 |
| 1 | DEN | Denver, CO: Denver International | 2513974 | 468519 | 18.64 | 0.895 | 0.1669 |
| 4 | SAN | San Diego, CA: San Diego International | 917862 | 175132 | 19.08 | 0.788 | 0.1503 |
| 6 | SLC | Salt Lake City, UT: Salt Lake City International | 1403384 | 205160 | 14.62 | 0.822 | 0.1202 |
airport_plot = airport_summary[["airport_code", "expected_delay_hours_per_flight"]].copy()
order = (
airport_plot.sort_values("expected_delay_hours_per_flight", ascending=False)
["airport_code"]
.astype(str)
.tolist()
)
airport_plot["airport"] = airport_plot["airport_code"].astype(str)
max_y = float(airport_plot["expected_delay_hours_per_flight"].max())
y_top = max_y * 1.25
airport_plot["label"] = airport_plot["expected_delay_hours_per_flight"].round(3).astype(str)
plot_airport = (
ggplot(airport_plot, aes(x="airport", y="expected_delay_hours_per_flight")) +
geom_bar(stat="identity", width=0.75, alpha=0.9) +
geom_text(aes(label="label"), va="bottom", nudge_y=max_y*0.05, size=9) +
scale_x_discrete(limits=order) +
coord_cartesian(ylim=[0, y_top]) +
labs(
title="Expected Delay Burden per Flight by Airport",
subtitle="Total delay minutes ÷ total flights (converted to hours)",
x="Airport",
y="Expected delay hours per flight"
) +
theme_minimal() +
theme(
axis_title=element_text(size=12),
plot_title=element_text(size=16),
plot_subtitle=element_text(size=12)
)
)
plot_airportConclusion (Task 2): The worst airport by expected delay burden is SFO. This airport is not necessarily the busiest, but it produces the highest average time loss per booked flight. Operationally, that makes it the highest-leverage improvement target: reducing delay minutes here saves more “hours per traveler” than the same improvement at a lower-burden airport. For travelers, this is the airport where scheduling buffers (or route alternatives) matter most.
What is the best month to fly if you want to avoid delays of any length?
Describe the metric you chose and why you chose it. Include one chart to help support your answer, with the x-axis ordered by month. (Rows missing Month are removed.)
If the goal is “avoid delays of any length,” the cleanest measure is binary: delayed vs not delayed.
delayed flights / total flightsThis treats a 5-minute delay and a 2-hour delay the same—because either one breaks the “no delay” goal.
month_summary_display = (
month_summary.assign(
prop_delayed_pct=(month_summary["prop_delayed"] * 100).round(2),
avg_delay_hours_per_delayed_flight=((month_summary["total_delay_minutes"] / month_summary["delayed_flights"]) / 60).round(3),
)[["month", "total_flights", "delayed_flights", "prop_delayed_pct", "avg_delay_hours_per_delayed_flight"]]
)
month_summary_display| month | total_flights | delayed_flights | prop_delayed_pct | avg_delay_hours_per_delayed_flight | |
|---|---|---|---|---|---|
| 0 | April | 1259723 | 231408 | 18.37 | 0.984 |
| 1 | August | 1335158 | 279699 | 20.95 | 1.007 |
| 2 | December | 1180278 | 303133 | 25.68 | 1.035 |
| 3 | February | 1115814 | 248033 | 22.23 | 0.991 |
| 4 | January | 1193018 | 265001 | 22.21 | 1.016 |
| 5 | July | 1371741 | 319960 | 23.33 | 1.066 |
| 6 | June | 1305663 | 317895 | 24.35 | 1.066 |
| 7 | March | 1213370 | 250142 | 20.62 | 0.996 |
| 8 | May | 1227795 | 233494 | 19.02 | 0.973 |
| 9 | November | 1185434 | 197768 | 16.68 | 0.936 |
| 10 | October | 1301612 | 235166 | 18.07 | 0.929 |
| 11 | September | 1227208 | 201905 | 16.45 | 0.949 |
month_plot = month_summary.copy()
# For plotting, show months with no computable proportion as 0-height bars so the x-axis remains complete.
month_plot["prop_delayed_plot"] = month_plot["prop_delayed"].fillna(0)
month_plot["label"] = np.where(
month_plot["month"].isin([best_month["month"], worst_month["month"]]),
(month_plot["prop_delayed"] * 100).round(2).astype(str) + "%",
""
)
max_y = float(month_plot["prop_delayed_plot"].max())
y_top = max_y * 1.30
plot_month = (
ggplot(month_plot, aes(x="month", y="prop_delayed_plot")) +
geom_bar(stat="identity", width=0.75, alpha=0.9) +
geom_text(aes(label="label"), va="bottom", nudge_y=max_y * 0.05, size=9) +
scale_x_discrete(limits=month_order) +
coord_cartesian(ylim=[0, y_top]) +
scale_y_continuous(format=".0%") +
labs(
title="Proportion of Flights Delayed by Month",
subtitle="X-axis is kept in calendar order; labels highlight the lowest- and highest-delay months",
x="Month",
y="Proportion delayed"
) +
theme_minimal() +
theme(
axis_text_x=element_text(angle=30, hjust=1),
axis_title=element_text(size=12),
plot_title=element_text(size=16),
plot_subtitle=element_text(size=12),
plot_margin=[10, 35, 10, 10]
) +
ggsize(950, 520)
)
plot_monthConclusion (Task 3): The best month to fly (lowest chance of any delay) is September at about 16.45% delayed. The worst month is December at 25.68%. Practically, month choice is a “free” optimization: it can reduce delay risk without changing airports or airlines. If a trip is flexible, shifting from the worst month toward the best month is one of the simplest ways to improve on-time reliability.
BTS notes that the “Weather” category reflects severe weather only. Mild weather-related delays are often recorded inside NAS and Late-Arriving Aircraft.
Create a new column estimating total weather delays using:
Also mean-impute missing Late-Arriving delay counts and show the first 5 rows.
This produces a weather measure that is closer to the real operational footprint of weather: not just storms, but also weather-driven ripple effects recorded as system or late-arrival delays.
late_mean = df["num_of_delays_late_aircraft"].mean(skipna=True)
df["late_aircraft_imputed"] = df["num_of_delays_late_aircraft"].fillna(late_mean)
def nas_frac(m):
if pd.isna(m):
return np.nan
return 0.40 if m in {"April","May","June","July","August"} else 0.65
df["nas_weather_frac"] = df["month"].apply(nas_frac)
df["weather_delays_est"] = (
df["num_of_delays_weather"] +
0.30 * df["late_aircraft_imputed"] +
df["nas_weather_frac"] * df["num_of_delays_nas"]
)
df[[
"airport_code", "month", "year",
"num_of_flights_total",
"num_of_delays_weather",
"num_of_delays_late_aircraft", "late_aircraft_imputed",
"num_of_delays_nas", "nas_weather_frac",
"weather_delays_est"
]].head(5)| airport_code | month | year | num_of_flights_total | num_of_delays_weather | num_of_delays_late_aircraft | late_aircraft_imputed | num_of_delays_nas | nas_weather_frac | weather_delays_est | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | ATL | January | 2005.0 | 35048 | 448 | NaN | 1109.104072 | 4598 | 0.65 | 3769.431222 |
| 1 | DEN | January | 2005.0 | 12687 | 233 | 928.0 | 928.000000 | 935 | 0.65 | 1119.150000 |
| 2 | IAD | January | 2005.0 | 12381 | 61 | 1058.0 | 1058.000000 | 895 | 0.65 | 960.150000 |
| 3 | ORD | January | 2005.0 | 28194 | 306 | 2255.0 | 2255.000000 | 5415 | 0.65 | 4502.250000 |
| 4 | SAN | January | 2005.0 | 7283 | 56 | 680.0 | 680.000000 | 638 | 0.65 | 674.700000 |
missing_before = int(df["num_of_delays_late_aircraft"].isna().sum())
plot_weather_dist = (
ggplot(df.dropna(subset=["weather_delays_est"]), aes(x="weather_delays_est")) +
geom_histogram(bins=30, alpha=0.9) +
labs(
title="Distribution of Estimated Weather-Related Delayed Flights",
subtitle=f"Severe weather + estimated mild weather (Late-Arriving mean-imputed; missing before: {missing_before})",
x="Estimated weather-related delayed flights (per record)",
y="Count of records"
) +
theme_minimal() +
theme(
axis_title=element_text(size=12),
plot_title=element_text(size=16),
plot_subtitle=element_text(size=12)
)
)
plot_weather_distConclusion (Task 4): This estimate is intentionally rule-based and traceable back to the BTS guidance. Mean-imputing Late-Arriving Aircraft prevents dropping records and keeps the derived weather estimate defined for every row used in airport comparisons. The result is a more realistic “weather impact” signal than the severe-only Weather column, because it captures downstream operational effects that weather creates even when the immediate cause is coded elsewhere.
Using the new weather estimate, create a barplot showing the proportion of all flights delayed by weather at each airport. Describe what you learn.
This focuses on traveler-facing risk rather than raw counts:
estimated weather-delayed flights / total flightsUsing a proportion makes airports comparable even when their total flight volumes differ substantially.
airport_weather = (
df.groupby("airport_code", as_index=False)
.agg(
airport_name=("airport_name", "first"),
total_flights=("num_of_flights_total", "sum"),
weather_delayed=("weather_delays_est", "sum")
)
)
airport_weather["prop_weather"] = airport_weather["weather_delayed"] / airport_weather["total_flights"]
order = (
airport_weather.sort_values("prop_weather", ascending=False)["airport_code"]
.astype(str).tolist()
)
airport_weather["airport"] = airport_weather["airport_code"].astype(str)
airport_weather["label"] = (airport_weather["prop_weather"] * 100).round(2).astype(str) + "%"
max_y = float(airport_weather["prop_weather"].max())
y_top = max_y * 1.25
plot_weather = (
ggplot(airport_weather, aes(x="airport", y="prop_weather")) +
geom_bar(stat="identity", width=0.75, alpha=0.9) +
geom_text(aes(label="label"), va="bottom", nudge_y=max_y*0.05, size=9) +
scale_x_discrete(limits=order) +
coord_cartesian(ylim=[0, y_top]) +
scale_y_continuous(format=".0%") +
labs(
title="Proportion of Flights Delayed by Weather (Estimated Severe + Mild)",
subtitle="Airport comparison on weather-delay risk (rates, not counts)",
x="Airport",
y="Proportion of flights delayed by weather"
) +
theme_minimal() +
theme(
axis_title=element_text(size=12),
plot_title=element_text(size=16),
plot_subtitle=element_text(size=12)
)
)
plot_weatherConclusion (Task 5): SFO has the highest weather-delay share at about 9.51%, while SLC is lowest at 4.11%. This is a useful operational distinction: a high-volume airport may generate many weather delays in raw counts, but a traveler experiences reliability through rate. The airports near the top of this chart are the places where weather volatility is most likely to disrupt schedules, so buffer time, earlier departures, or alternative routings are most justified.
Which delay is the worst delay? Repeat the proportion analysis for Weather (est.), Carrier, and Security. Compare in a chart and a table. Describe results.
Security delays are present, but they are typically orders of magnitude smaller than carrier or weather-driven delays. A linear axis hides that difference by visually flattening Security to “near zero.” To preserve a single-view comparison while keeping Security visible, the chart uses a log-scaled y-axis and an explicit lower bound.
airport_cat = (
df.groupby("airport_code", as_index=False)
.agg(
airport_name=("airport_name", "first"),
total_flights=("num_of_flights_total", "sum"),
weather=("weather_delays_est", "sum"),
carrier=("num_of_delays_carrier", "sum"),
security=("num_of_delays_security", "sum"),
)
)
airport_cat["weather_pct"] = (airport_cat["weather"] / airport_cat["total_flights"]) * 100
airport_cat["carrier_pct"] = (airport_cat["carrier"] / airport_cat["total_flights"]) * 100
airport_cat["security_pct"] = (airport_cat["security"] / airport_cat["total_flights"]) * 100
stretch_table = (
airport_cat[["airport_code", "airport_name", "total_flights", "weather_pct", "carrier_pct", "security_pct"]]
.assign(
weather_pct=lambda d: d["weather_pct"].round(3),
carrier_pct=lambda d: d["carrier_pct"].round(3),
security_pct=lambda d: d["security_pct"].round(3),
)
.sort_values("weather_pct", ascending=False)
)
stretch_table| airport_code | airport_name | total_flights | weather_pct | carrier_pct | security_pct | |
|---|---|---|---|---|---|---|
| 5 | SFO | San Francisco, CA: San Francisco International | 1630945 | 9.508 | 5.299 | 0.043 |
| 3 | ORD | Chicago, IL: Chicago O'Hare International | 3597588 | 8.428 | 4.072 | 0.024 |
| 0 | ATL | Atlanta, GA: Hartsfield-Jackson Atlanta Intern... | 4430047 | 6.799 | 3.907 | 0.019 |
| 2 | IAD | Washington, DC: Washington Dulles International | 851571 | 5.842 | 5.609 | 0.032 |
| 1 | DEN | Denver, CO: Denver International | 2513974 | 5.744 | 4.848 | 0.039 |
| 4 | SAN | San Diego, CA: San Diego International | 917862 | 5.088 | 6.224 | 0.053 |
| 6 | SLC | Salt Lake City, UT: Salt Lake City International | 1403384 | 4.114 | 4.634 | 0.062 |
long = airport_cat.melt(
id_vars=["airport_code", "airport_name", "total_flights"],
value_vars=["weather_pct", "carrier_pct", "security_pct"],
var_name="category",
value_name="pct"
)
label_map = {"weather_pct": "Weather (est.)", "carrier_pct": "Carrier", "security_pct": "Security"}
long["category"] = long["category"].map(label_map)
order = (
airport_cat.sort_values("weather_pct", ascending=False)["airport_code"]
.astype(str).tolist()
)
long["airport"] = long["airport_code"].astype(str)
min_pos = float(long.loc[long["pct"] > 0, "pct"].min())
lower = float(max(0.005, min_pos * 0.8))
upper = float(long["pct"].max()) * 1.35
long = long.copy()
long["lower"] = lower
plot_stretch = (
ggplot(long, aes(x="airport", y="pct", color="category")) +
geom_segment(
aes(x="airport", xend="airport", y="lower", yend="pct"),
position=position_dodge(width=0.65),
size=1.2, alpha=0.85
) +
geom_point(position=position_dodge(width=0.65), size=6, alpha=0.95) +
scale_y_log10() +
scale_x_discrete(limits=order) +
coord_cartesian(ylim=[lower, upper]) +
labs(
title="Delay Category Comparison by Airport",
subtitle="Percent of flights delayed (log scale to keep rare Security delays visible)",
x="Airport (ordered by weather-delay share)",
y="Percent of flights delayed (log scale)",
color="Delay category"
) +
theme_minimal() +
theme(
axis_text_x=element_text(angle=30, hjust=1),
axis_title=element_text(size=12),
plot_title=element_text(size=16),
plot_subtitle=element_text(size=12),
legend_title=element_text(size=11),
legend_text=element_text(size=10)
)
)
plot_stretchtotals = pd.DataFrame({
"category": ["Weather (est.)", "Carrier", "Security"],
"delayed_flights": [airport_cat["weather"].sum(), airport_cat["carrier"].sum(), airport_cat["security"].sum()],
"total_flights": [airport_cat["total_flights"].sum()] * 3
})
totals["prop"] = totals["delayed_flights"] / totals["total_flights"]
worst = totals.sort_values("prop", ascending=False).iloc[0]
least = totals.sort_values("prop", ascending=True).iloc[0]
security_total = int(airport_cat["security"].sum())Conclusion (Stretch): Across the full dataset, Weather (est.) is the largest contributor among these three (≈ 6.89% of flights), while Security is the smallest (≈ 0.033%). Security delays are still present (total security-delayed flights = 5006), but they are rare enough that a linear axis makes them appear to be zero. The log-scale view preserves the correct ordering of magnitude while keeping the comparison in a single, readable chart.