Data is available only upon formal request and subject to approval.
Approved users receive a secure institute account and work with the data exclusively in our Trusted Research Environment (TRE) via remote desktop.
Request data (Email to us)
Workshop training application for the MIE 2026 TRACE session. The purpose of this application is to demonstrate the TRACE data access workflow using a synthetic longitudinal clinical dataset. Participants will use the provided data to run example Python/Jupyter analyses and visualize cohort composition, disease activity over time, patient-level trajectories, and laboratory markers.
record_id, screen_date, ic_obtained, consent_date, ie_all, brthdtc, sex, ethnic, smoking, bmi, mhcat, icd, mhstdtc, disease_duration, disease_behaviour, eim_present, visit_attended, visit_date, symptoms, pga, current_flare, cmtrt, aeyn, aeterm, esr_mm_h, hb_mg_dl, fec_cal, proc_type, proc_date, dscompl, dsreas, clinical_remission, ibd_surgery, trt_escalation, trt_discontinuation
#!/usr/bin/env python3
"""
MIE 2026 Workshop: Fake IBD Study Plots

This script is a plain-Python version of the original notebook. It assumes there
is one CSV file only: the study data export. It does not require the REDCap data
dictionary because the relevant value labels are defined manually below.

The fake study is treated as a longitudinal inflammatory bowel disease cohort
with enrollment, demographic/diagnosis, baseline, visit 1, and visit 2 rows for
each participant.

Example:
    python MIE2026_fake_IBD_workshop_plots_single_csv.py \
        --data MIE2026Workshop_DATA_2026-05-18_1112.csv \
        --output-dir plots

The script saves PNG plots and CSV summary tables to the output directory.
Use --show-plots if you also want the plots to open interactively.
"""
from __future__ import annotations
import argparse
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# File name of the study export that is used when --data is not given on the
# command line.
DEFAULT_DATA_FILE = "MIE2026Workshop_DATA_2026-05-18_1112.csv"
# Make plots a little easier to read.
# These settings are applied to matplotlib's global rcParams when the module is
# loaded, so they act as defaults for figures created afterwards (a figure that
# passes its own figsize overrides the default size).
plt.rcParams["figure.figsize"] = (8, 5)
plt.rcParams["axes.grid"] = True
plt.rcParams["grid.alpha"] = 0.3
# -----------------------------------------------------------------------------
# Manually defined REDCap-style labels
# -----------------------------------------------------------------------------
# REDCap unique event names mapped to the labels shown in tables and plots.
EVENT_MAP = dict(
    enrollment_arm_1="Enrollment",
    demographics_arm_1="Demographics / diagnosis",
    baseline_visit_arm_1="Baseline",
    visit_1_arm_1="Visit 1",
    visit_2_arm_1="Visit 2",
)

# Order of the three visit events; used as the category / axis order for the
# visit-level tables and plots.
VISIT_ORDER = ["Baseline", "Visit 1", "Visit 2"]

# Every coded field below uses consecutive integer codes starting at 1, so each
# map is built by numbering its labels in code order (first label -> code 1).
SEX_MAP = dict(enumerate(["Female", "Male"], start=1))

ETHNICITY_MAP = dict(
    enumerate(["Caucasian", "Asian", "African", "Other"], start=1)
)

SMOKING_MAP = dict(
    enumerate(["Never smoker", "Ex-smoker", "Current smoker"], start=1)
)

BMI_MAP = dict(enumerate(["<18.5", "18.5-24.9", "25-29.9", ">30"], start=1))

DIAGNOSIS_MAP = dict(
    enumerate(
        [
            "Ulcerative colitis (K51)",
            "Crohn's disease (K50)",
            "Indeterminate colitis (K52.3)",
        ],
        start=1,
    )
)

DISEASE_BEHAVIOUR_MAP = dict(
    enumerate(
        ["Non-stricturing non-penetrating", "Stricturing", "Penetrating"],
        start=1,
    )
)

# Physician global assessment (PGA): code 1 is remission, code 4 is severe.
PGA_MAP = dict(
    enumerate(
        ["Remission", "Mild disease", "Moderate disease", "Severe disease"],
        start=1,
    )
)
# Labels in ascending code order; doubles as the ordered-category order.
PGA_ORDER = list(PGA_MAP.values())

# Laboratory markers are stored as two-level categories, not measurements.
ESR_MAP = dict(enumerate(["<30 mm/h", ">=30 mm/h"], start=1))

HB_MAP = dict(enumerate(["<10.5 g/dL", ">=10.5 g/dL"], start=1))

# Note the direction: for fecal calprotectin, code 1 is the elevated category.
FECAL_CALPROTECTIN_MAP = dict(enumerate([">50 ug/mg", "<50 ug/mg"], start=1))
# -----------------------------------------------------------------------------
# Data loading and preparation
# -----------------------------------------------------------------------------
def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
    """Parse the command-line arguments of the workshop plotting script.

    Args:
        argv: Argument strings to parse, without the program name. With the
            default ``None`` argparse reads ``sys.argv[1:]``, which is what the
            script does when run from the shell. Passing an explicit list lets
            tests and notebooks use the parser without modifying ``sys.argv``.

    Returns:
        A namespace with three attributes: ``data`` (path string of the study
        CSV), ``output_dir`` (directory string for plots and tables) and
        ``show_plots`` (``True`` only when ``--show-plots`` was given).
    """
    parser = argparse.ArgumentParser(
        description="Create workshop plots from a single fake IBD study CSV export."
    )
    parser.add_argument(
        "--data",
        default=DEFAULT_DATA_FILE,
        help=f"Path to the study CSV file. Default: {DEFAULT_DATA_FILE}",
    )
    parser.add_argument(
        "--output-dir",
        default="workshop_outputs",
        help="Directory where plots and summary tables will be saved.",
    )
    parser.add_argument(
        "--show-plots",
        action="store_true",
        help="Show plots interactively in addition to saving them.",
    )
    return parser.parse_args(argv)
def resolve_data_path(data_file: str | Path) -> Path:
    """Find the CSV file locally or in /mnt/data when run in a sandbox.

    Two candidates are tried in order: the path exactly as given, then a file
    with the same name directly inside ``/mnt/data``.

    Args:
        data_file: Path (or path string) of the study CSV.

    Returns:
        The first candidate that is an existing regular file.

    Raises:
        FileNotFoundError: If neither candidate is an existing file.
    """
    data_file = Path(data_file)
    candidates = (data_file, Path("/mnt/data") / data_file.name)
    for candidate in candidates:
        # is_file() instead of exists(): a directory that happens to carry the
        # expected name must not be returned, otherwise the failure only shows
        # up later when the path is read as CSV, with a less helpful error.
        if candidate.is_file():
            return candidate
    raise FileNotFoundError(
        f"Could not find {data_file}. Put the CSV in the same folder as this "
        "script, or pass the correct path with --data."
    )
def load_data(csv_path: Path) -> pd.DataFrame:
    """Read the study export and report its size on stdout.

    Args:
        csv_path: Location of the CSV file to read with ``pandas.read_csv``.

    Returns:
        The parsed table, unmodified.
    """
    frame = pd.read_csv(csv_path)
    n_rows, n_cols = frame.shape
    report = (
        f"Loaded: {csv_path}",
        f"Rows: {n_rows}",
        f"Columns: {n_cols}",
    )
    for line in report:
        print(line)
    return frame
def inspect_longitudinal_structure(df: pd.DataFrame) -> None:
    """Print a short overview of the REDCap-style longitudinal layout.

    Three things are written to stdout: the number of distinct ``record_id``
    values, the row count per ``redcap_event_name`` (sorted by event name), and
    the list of column names.

    Args:
        df: The raw study export; must contain the ``record_id`` and
            ``redcap_event_name`` columns.
    """
    participant_count = df["record_id"].nunique()
    print("\nNumber of unique participants:", participant_count)

    print("\nRows by REDCap event:")
    rows_per_event = df["redcap_event_name"].value_counts().sort_index()
    print(rows_per_event.to_frame("n_rows"))

    print("\nColumn names:")
    print([*df.columns])
def create_analysis_datasets(df: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Split the raw export into a demographics table and a visit-level table.

    Args:
        df: Raw study export with one row per participant and REDCap event.

    Returns:
        ``(demographics, visits)``. Both are copies of the matching rows of
        ``df`` with extra columns:

        * ``demographics`` holds the ``demographics_arm_1`` rows plus the label
          columns ``sex_label``, ``diagnosis_label``, ``smoking_label``,
          ``bmi_label`` and ``disease_behaviour_label``.
        * ``visits`` holds the baseline, visit 1 and visit 2 rows plus
          ``visit`` (ordered categorical), ``pga_code`` (numeric) and
          ``pga_label`` (ordered categorical).
    """
    event_name = df["redcap_event_name"]

    demographics = df.loc[event_name == "demographics_arm_1"].copy()

    visit_events = ["baseline_visit_arm_1", "visit_1_arm_1", "visit_2_arm_1"]
    visits = df.loc[event_name.isin(visit_events)].copy()

    # Ordered categoricals keep tables and plot axes in visit / severity order
    # rather than alphabetical order.
    visits["visit"] = pd.Categorical(
        visits["redcap_event_name"].map(EVENT_MAP),
        categories=VISIT_ORDER,
        ordered=True,
    )
    # errors="coerce" turns anything non-numeric into NaN, which then maps to a
    # missing label instead of raising.
    visits["pga_code"] = pd.to_numeric(visits["pga"], errors="coerce")
    visits["pga_label"] = pd.Categorical(
        visits["pga_code"].map(PGA_MAP),
        categories=PGA_ORDER,
        ordered=True,
    )

    # (new label column, coded source column, code -> label map)
    label_specs = (
        ("sex_label", "sex", SEX_MAP),
        ("diagnosis_label", "icd", DIAGNOSIS_MAP),
        ("smoking_label", "smoking", SMOKING_MAP),
        ("bmi_label", "bmi", BMI_MAP),
        ("disease_behaviour_label", "disease_behaviour", DISEASE_BEHAVIOUR_MAP),
    )
    for label_column, code_column, code_labels in label_specs:
        codes = pd.to_numeric(demographics[code_column], errors="coerce")
        demographics[label_column] = codes.map(code_labels)

    print("\nDemographics shape:", demographics.shape)
    print("Visits shape:", visits.shape)
    return demographics, visits
# -----------------------------------------------------------------------------
# Plot helpers
# -----------------------------------------------------------------------------
def save_or_show(fig: plt.Figure, output_path: Path, show_plots: bool) -> None:
    """Write a figure to disk, then either display it or release it.

    Args:
        fig: Figure to save.
        output_path: Destination file; written at 300 dpi with a tight
            bounding box.
        show_plots: When true, ``plt.show()`` is called and the figure is not
            closed by this function. When false, the figure is closed right
            after saving.
    """
    fig.savefig(output_path, dpi=300, bbox_inches="tight")
    print(f"Saved plot: {output_path}")

    if not show_plots:
        plt.close(fig)
        return
    plt.show()
def plot_diagnosis_by_sex(
    demographics: pd.DataFrame, output_dir: Path, show_plots: bool
) -> pd.DataFrame:
    """Plot 1: stacked bar chart of participant counts per diagnosis and sex.

    Args:
        demographics: Table with ``diagnosis_label`` and ``sex_label`` columns.
        output_dir: Existing directory that receives
            ``plot1_diagnosis_by_sex.png`` and ``plot1_diagnosis_by_sex.csv``.
        show_plots: Forwarded to ``save_or_show``.

    Returns:
        The diagnosis-by-sex count table that was plotted.
    """
    counts = pd.crosstab(
        demographics["diagnosis_label"],
        demographics["sex_label"],
    )

    # Fixed display order; diagnoses that do not occur in the data are skipped
    # so that reindexing does not introduce all-NaN rows.
    preferred_order = (
        "Ulcerative colitis (K51)",
        "Crohn's disease (K50)",
        "Indeterminate colitis (K52.3)",
    )
    present = [label for label in preferred_order if label in counts.index]
    counts = counts.reindex(present)

    axes = counts.plot.bar(stacked=True)
    axes.set(
        title="Cohort overview: diagnosis by sex",
        xlabel="Diagnosis",
        ylabel="Number of participants",
    )
    axes.legend(title="Sex")
    plt.xticks(rotation=25, ha="right")
    plt.tight_layout()

    save_or_show(
        axes.get_figure(), output_dir / "plot1_diagnosis_by_sex.png", show_plots
    )
    counts.to_csv(output_dir / "plot1_diagnosis_by_sex.csv")
    return counts
def plot_disease_activity_distribution(
    visits: pd.DataFrame, output_dir: Path, show_plots: bool
) -> pd.DataFrame:
    """Plot 2: 100% stacked bars of disease activity (PGA) for each visit.

    Args:
        visits: Visit-level table with ``visit`` and ``pga_label`` columns.
        output_dir: Existing directory that receives
            ``plot2_disease_activity_distribution.png`` and the rounded table
            ``plot2_disease_activity_distribution_percent.csv``.
        show_plots: Forwarded to ``save_or_show``.

    Returns:
        Unrounded row percentages, visits as rows and PGA labels as columns.
    """
    # Reindex on both axes so every visit and every PGA level is present, with
    # a count of 0 where the combination does not occur in the data.
    counts = pd.crosstab(visits["visit"], visits["pga_label"]).reindex(
        index=VISIT_ORDER, columns=PGA_ORDER, fill_value=0
    )
    visit_totals = counts.sum(axis=1)
    percent = counts.div(visit_totals, axis=0) * 100

    axes = percent.plot.bar(stacked=True)
    axes.set(
        title="Disease activity distribution across visits",
        xlabel="Visit",
        ylabel="Participants (%)",
    )
    # Anchor the legend outside the axes, to the right of the plot area.
    axes.legend(
        title="Physician global assessment",
        bbox_to_anchor=(1.02, 1),
        loc="upper left",
    )
    plt.xticks(rotation=0)
    plt.tight_layout()

    save_or_show(
        axes.get_figure(),
        output_dir / "plot2_disease_activity_distribution.png",
        show_plots,
    )
    percent.round(1).to_csv(
        output_dir / "plot2_disease_activity_distribution_percent.csv"
    )
    return percent
def plot_patient_trajectories(
    visits: pd.DataFrame, output_dir: Path, show_plots: bool
) -> pd.DataFrame:
    """Plot 3: heatmap with one row per participant and one column per visit.

    Args:
        visits: Visit-level table with ``record_id``, ``visit`` and
            ``pga_code`` columns.
        output_dir: Existing directory that receives
            ``plot3_patient_trajectories.png`` and
            ``plot3_patient_trajectories.csv``.
        show_plots: Forwarded to ``save_or_show``.

    Returns:
        The participant-by-visit table of PGA codes, sorted by the baseline
        code, then visit 1, then visit 2.
    """
    # aggfunc="first" keeps a single PGA code per participant and visit.
    wide = visits.pivot_table(
        index="record_id",
        columns="visit",
        values="pga_code",
        aggfunc="first",
    )
    wide = wide.reindex(columns=VISIT_ORDER)
    # Sorting groups participants with the same course next to each other.
    ordered = wide.sort_values(by=VISIT_ORDER)

    fig, ax = plt.subplots(figsize=(7, 10))
    # Fixed colour limits 1..4 correspond to the four PGA codes.
    heatmap = ax.imshow(ordered, aspect="auto", vmin=1, vmax=4)
    ax.set(
        title="Patient-level disease activity trajectories",
        xlabel="Visit",
        ylabel="Participant",
    )

    ax.set_xticks(np.arange(len(VISIT_ORDER)))
    ax.set_xticklabels(VISIT_ORDER)

    # Show only every 10th participant label to avoid clutter.
    label_every = 10
    ax.set_yticks(np.arange(0, len(ordered), label_every))
    ax.set_yticklabels(ordered.index[::label_every])

    pga_codes = [1, 2, 3, 4]
    colorbar = fig.colorbar(heatmap, ax=ax, ticks=pga_codes)
    colorbar.ax.set_yticklabels([PGA_MAP[code] for code in pga_codes])
    colorbar.set_label("Physician global assessment")

    plt.tight_layout()
    save_or_show(fig, output_dir / "plot3_patient_trajectories.png", show_plots)
    ordered.to_csv(output_dir / "plot3_patient_trajectories.csv")
    return ordered
def plot_abnormal_labs_by_activity(
    visits: pd.DataFrame, output_dir: Path, show_plots: bool
) -> pd.DataFrame:
    """Plot 4: abnormal lab markers by clinical disease activity.

    For every PGA level, the share of visits whose lab marker falls in the
    abnormal category is computed and drawn as grouped bars. Visits where a
    marker is missing (or not numeric) are left out of that marker's
    denominator instead of being counted as normal.

    Args:
        visits: Visit-level table with ``pga_label`` and the coded lab columns
            ``esr_mm_h``, ``hb_mg_dl`` and ``fec_cal``.
        output_dir: Existing directory that receives
            ``plot4_abnormal_labs_by_activity.png`` and the rounded table
            ``plot4_abnormal_labs_by_activity_percent.csv``.
        show_plots: Forwarded to ``save_or_show``.

    Returns:
        Unrounded percentages, PGA labels as rows and lab markers as columns.
        A cell is NaN when no visit of that PGA level has a value for the
        marker.
    """
    lab_visits = visits.copy()

    # In this fake dataset, the lab variables are coded categories:
    # esr_mm_h: 1 = <30 mm/h, 2 = >=30 mm/h
    # hb_mg_dl: 1 = <10.5 g/dL, 2 = >=10.5 g/dL
    # fec_cal: 1 = >50 ug/mg, 2 = <50 ug/mg
    # Each entry: (plot label, source column, code of the abnormal category).
    lab_definitions = (
        ("ESR >=30 mm/h", "esr_mm_h", 2),
        ("Hemoglobin <10.5 g/dL", "hb_mg_dl", 1),
        ("Fecal calprotectin >50 ug/mg", "fec_cal", 1),
    )
    for label, source_column, abnormal_code in lab_definitions:
        codes = pd.to_numeric(lab_visits[source_column], errors="coerce")
        # A plain .eq() would turn a missing code into False and so count the
        # visit as "normal". Storing 1.0 / 0.0 and restoring NaN for missing
        # codes lets the group mean skip those visits.
        lab_visits[label] = (
            codes.eq(abnormal_code).astype(float).where(codes.notna())
        )
    lab_cols = [label for label, _, _ in lab_definitions]

    plot_data = (
        lab_visits.dropna(subset=["pga_label"])
        .groupby("pga_label", observed=False)[lab_cols]
        .mean()
        .mul(100)
        .reindex(PGA_ORDER)
    )

    ax = plot_data.plot(kind="bar")
    ax.set_title("Abnormal lab markers by clinical disease activity")
    ax.set_xlabel("Physician global assessment")
    ax.set_ylabel("Visits with abnormal marker (%)")
    ax.legend(title="Lab marker", bbox_to_anchor=(1.02, 1), loc="upper left")
    plt.xticks(rotation=25, ha="right")
    plt.tight_layout()
    fig = ax.get_figure()
    save_or_show(fig, output_dir / "plot4_abnormal_labs_by_activity.png", show_plots)
    plot_data.round(1).to_csv(
        output_dir / "plot4_abnormal_labs_by_activity_percent.csv"
    )
    return plot_data
# -----------------------------------------------------------------------------
# Main workflow
# -----------------------------------------------------------------------------
def main() -> None:
    """Run the whole workshop workflow from the command line.

    Steps: parse the arguments, create the output directory, load and inspect
    the CSV, derive the analysis tables, produce the four plots with their
    summary tables, then print table previews and the discussion questions.
    """
    args = parse_args()

    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    df = load_data(resolve_data_path(args.data))
    inspect_longitudinal_structure(df)
    demographics, visits = create_analysis_datasets(df)

    print("\nCreating plots and summary tables...")
    show = args.show_plots
    diagnosis_table = plot_diagnosis_by_sex(demographics, output_dir, show)
    activity_table = plot_disease_activity_distribution(visits, output_dir, show)
    trajectory_table = plot_patient_trajectories(visits, output_dir, show)
    labs_table = plot_abnormal_labs_by_activity(visits, output_dir, show)

    print("\nPreview of summary tables:")
    previews = (
        ("\nPlot 1: diagnosis by sex", diagnosis_table),
        ("\nPlot 2: disease activity distribution, percent", activity_table.round(1)),
        ("\nPlot 3: patient trajectories, first rows", trajectory_table.head()),
        ("\nPlot 4: abnormal labs by activity, percent", labs_table.round(1)),
    )
    for heading, table in previews:
        print(heading)
        print(table)

    print("\nInterpretation questions for workshop participants:")
    questions = (
        "1. Which diagnosis is most common in this fake cohort?",
        "2. Does disease activity appear to improve or worsen over time?",
        "3. Do individual patient trajectories tell a different story from the group-level stacked bar chart?",
        "4. Do abnormal lab markers align with clinical disease activity?",
    )
    for question in questions:
        print(question)


if __name__ == "__main__":
    main()
| Version | Language | Type | Relation | Author | Date |
|---|---|---|---|---|---|
| Global v1 (Python v1) selected | Python | Single Script | Initial Implementation | trace20 | 2026-05-26 |
| Global v2 (Python v2) | Python | Multi-file Archive | Refinement/Bug Fix ← Global v1 | mmueller | 2026-05-26 |
| Global v3 (Python v3) | Python | Multi-file Archive | Refinement/Bug Fix ← Global v2 | loki | 2026-05-26 |