E. coli Knockout Growth Curves

Cluster size distribution — LB

Number of genes assigned to each growth phenotype cluster in LB medium.

Cluster size distribution — M63

Number of genes assigned to each growth phenotype cluster in M63 medium.

Phenotype shift LB → M63

Each cell shows how many genes move from a given LB cluster (row) to an M63 cluster (column). Diagonal = same phenotype in both media. ✕ = non-growing sentinel cluster.

Non-growing knockouts in M63 — putative auxotrophs

These — genes do not grow in M63 minimal medium but grow normally in LB. Their knockouts cannot synthesise an essential metabolite from the minimal carbon/nitrogen source, making them dependent on external supply (auxotrophs).

WCSS elbow plot

Within-cluster sum of squares (WCSS) from KinBiont's k-means sweep. The elbow point (★) was selected as the optimal number of clusters.

Interpretation

WCSS measures the total variance unexplained by the clusters. It always decreases as k increases, but the rate of improvement diminishes past the elbow — adding more clusters gives diminishing returns in explanatory power. The elbow is detected as the point of maximum curvature (largest second derivative) in the WCSS curve.

LB cluster profiles

Mean growth curve (OD) ± 1 SD for each cluster. Line width ∝ cluster size.

M63 cluster profiles

Mean growth curve (OD) ± 1 SD for each cluster. Line width ∝ cluster size.

Z-scored shape prototypes

Centroids normalised to zero mean / unit variance per curve — shape is independent of absolute OD magnitude. Non-growing clusters are excluded: z-scoring a flat signal either collapses to all-zeros (constant curves) or amplifies measurement noise, so their z-score carries no shape information.

Gene-level growth curves

Cluster centroids shown by default. Use the buttons to reveal all curves for a cluster, or search below and click genes to overlay individual curves (up to 10).

Show all curves:

Gene list — click to overlay

Gene	JW-ID	LB cluster	M63 cluster

LB vs M63 — per-gene overlay

Select a gene from the table below to compare its growth curve in both media side by side. Shaded area = ± 1 SD across replicates.

Select gene

Gene	JW-ID	LB cluster	M63 cluster	Δ cluster?

Analysis script — `analysis/analyse.jl`

Julia · reads XLSX data · aggregates replicates · runs KinBiont.jl k-means (WCSS elbow) · exports JSON

View on GitHub

#!/usr/bin/env julia
# analyse.jl — Growth curve analysis for E. coli Keio knockout dataset
#
# Setup (run once):
#   julia --project=. -e '
#     using Pkg
#     Pkg.develop(path="../../KinBiont.jl")
#     Pkg.instantiate()
#   '
#
# Run:
#   julia --project=. analyse.jl
#
# Output: ../docs/data/curves_data.json

using XLSX
using Statistics
using JSON3
using Kinbiont

const DATA_DIR = joinpath(@__DIR__, "..")
const OUT_PATH = joinpath(DATA_DIR, "docs", "data", "curves_data.json")

# ─────────────────────────────────────────────────────────────────────────────
# Helpers
# ─────────────────────────────────────────────────────────────────────────────

_as_float(v) = (ismissing(v) || v === nothing) ? NaN : Float64(v)
_as_str(v)   = (ismissing(v) || v === nothing) ? ""  : string(v)

function find_elbow(ks, wcss_vals)
    # Largest second-difference (maximum curvature in the elbow)
    length(ks) < 3 && return ks[end]
    d2 = diff(diff(wcss_vals))
    return ks[argmax(d2) + 1]
end

function raw_centroids(curves::Matrix{Float64}, labels::Vector{Int}, n_k::Int)
    n_tp = size(curves, 2)
    cents = zeros(n_k, n_tp)
    for k in 1:n_k
        idx = findall(==(k), labels)
        isempty(idx) && continue
        cents[k, :] = vec(mean(curves[idx, :], dims=1))
    end
    return cents
end

# ─────────────────────────────────────────────────────────────────────────────
# 1. Metadata: curve_id → (gene, jw_id, medium)
# ─────────────────────────────────────────────────────────────────────────────

function load_metadata()
    path = joinpath(DATA_DIR, "Curves_knockouts_media.xlsx")
    @info "Reading metadata from $path"
    xf = XLSX.readxlsx(path)
    sh = xf[1]
    data = sh[:]

    meta = Dict{String, @NamedTuple{gene::String, jw_id::String, medium::String}}()
    for i in 2:size(data, 1)
        row = data[i, :]
        any(j -> ismissing(row[j]), 1:5) && continue
        curve_id  = _as_str(row[1])
        jw_id     = _as_str(row[2])
        gene_name = _as_str(row[3])
        # col 4 = gene_category (unused here)
        medium    = _as_str(row[5])
        isempty(curve_id) && continue
        meta[curve_id] = (; gene=gene_name, jw_id, medium)
    end
    @info "  $(length(meta)) curve-to-gene mappings loaded"
    return meta
end

# ─────────────────────────────────────────────────────────────────────────────
# 2. Growth curves for one medium file
# ─────────────────────────────────────────────────────────────────────────────

function load_curves(path::String, sheet_name::String, wanted::Set{String})
    @info "Reading curves from $path (sheet=$sheet_name, want=$(length(wanted)) curves)"
    xf   = XLSX.readxlsx(path)
    sh   = xf[sheet_name]
    data = sh[:]
    nrows, ncols = size(data)

    # Row 1: column headers  (Time, Curve00001, …)
    headers = [_as_str(data[1, j]) for j in 1:ncols]

    # Collect valid time rows
    times     = Float64[]
    row_index = Int[]
    for i in 2:nrows
        t = _as_float(data[i, 1])
        isfinite(t) || continue
        push!(times, t)
        push!(row_index, i)
    end

    # Extract only curves in `wanted`
    curves = Dict{String, Vector{Float64}}()
    for j in 2:ncols
        hdr = headers[j]
        hdr in wanted || continue
        vals = [_as_float(data[row_index[k], j]) for k in eachindex(row_index)]
        # Replace NaN with 0 (blank baseline)
        replace!(vals, NaN => 0.0)
        curves[hdr] = vals
    end

    @info "  $(length(times)) time points, $(length(curves)) curves extracted"
    return times, curves
end

# ─────────────────────────────────────────────────────────────────────────────
# 3. Aggregate replicates → mean ± std per (gene, medium)
# ─────────────────────────────────────────────────────────────────────────────

function aggregate_by_gene(meta, curves_dict::Dict{String,Vector{Float64}})
    # curves_dict: curve_id → OD vector (all same medium)
    groups = Dict{String, Vector{Vector{Float64}}}()   # gene → list of replicate vectors
    jw_map = Dict{String, String}()

    for (curve_id, info) in meta
        haskey(curves_dict, curve_id) || continue
        gene = info.gene
        if !haskey(groups, gene)
            groups[gene]  = Vector{Float64}[]
            jw_map[gene]  = info.jw_id
        end
        push!(groups[gene], curves_dict[curve_id])
    end

    result = Dict{String, @NamedTuple{mean::Vector{Float64}, std::Vector{Float64},
                                       n::Int, jw_id::String}}()
    for (gene, replicates) in groups
        mat      = reduce(hcat, replicates)   # n_tp × n_replicates
        μ        = vec(Statistics.mean(mat, dims=2))
        σ        = size(mat, 2) > 1 ? vec(Statistics.std(mat, dims=2)) : zeros(length(μ))
        result[gene] = (; mean=μ, std=σ, n=size(mat, 2), jw_id=jw_map[gene])
    end
    return result
end

# ─────────────────────────────────────────────────────────────────────────────
# 4. KinBiont clustering with WCSS elbow sweep
# ─────────────────────────────────────────────────────────────────────────────

function cluster_gene_curves(
    gene_means::Matrix{Float64},   # n_genes × n_tp
    times::Vector{Float64},
    gene_labels::Vector{String};
    k_range = 2:10,
)
    gd = GrowthData(gene_means, times, gene_labels)

    # WCSS sweep
    @info "  Running WCSS sweep k=$(first(k_range))..$(last(k_range))"
    ks        = collect(k_range)
    wcss_vals = Float64[]
    for k in ks
        opts      = FitOptions(
            cluster                    = true,
            n_clusters                 = k,
            cluster_prescreen_constant = true,
            cluster_tol_const          = 1.5,
        )
        proc = preprocess(gd, opts)
        push!(wcss_vals, something(proc.wcss, 0.0))
    end

    # Elbow
    opt_k = find_elbow(ks, wcss_vals)
    @info "  Optimal k = $opt_k (elbow method)"

    # Final clustering
    opts_final = FitOptions(
        cluster                    = true,
        n_clusters                 = opt_k,
        cluster_prescreen_constant = true,
        cluster_tol_const          = 1.5,
    )
    proc_final = preprocess(gd, opts_final)

    return (
        ks            = ks,
        wcss          = wcss_vals,
        optimal_k     = opt_k,
        clusters      = something(proc_final.clusters, ones(Int, size(gene_means, 1))),
        centroids_z   = proc_final.centroids,          # z-scored shape prototypes
        centroids_raw = raw_centroids(gene_means,
                            something(proc_final.clusters,
                                      ones(Int, size(gene_means, 1))), opt_k),
    )
end

# ─────────────────────────────────────────────────────────────────────────────
# 5. Main
# ─────────────────────────────────────────────────────────────────────────────

function main()
    meta = load_metadata()

    ids_lb  = Set(k for (k, v) in meta if v.medium == "LB")
    ids_m63 = Set(k for (k, v) in meta if v.medium == "M63")
    meta_lb  = Dict(k => v for (k, v) in meta if v.medium == "LB")
    meta_m63 = Dict(k => v for (k, v) in meta if v.medium == "M63")

    times_lb,  curves_lb  = load_curves(joinpath(DATA_DIR, "Growth_curves_LB.xlsx"),  "LB",  ids_lb)
    times_m63, curves_m63 = load_curves(joinpath(DATA_DIR, "Growth_curves_M63.xlsx"), "M63", ids_m63)

    times = length(times_lb) <= length(times_m63) ? times_lb : times_m63
    n_tp  = length(times)
    foreach(k -> curves_lb[k]  = curves_lb[k][1:n_tp],  keys(curves_lb))
    foreach(k -> curves_m63[k] = curves_m63[k][1:n_tp], keys(curves_m63))

    @info "Aggregating replicates by gene..."
    agg_lb  = aggregate_by_gene(meta_lb,  curves_lb)
    agg_m63 = aggregate_by_gene(meta_m63, curves_m63)

    genes_lb_sorted  = sort(collect(keys(agg_lb)))
    genes_m63_sorted = sort(collect(keys(agg_m63)))
    all_genes         = sort(collect(Set(genes_lb_sorted) ∪ Set(genes_m63_sorted)))
    @info "  $(length(all_genes)) unique genes"

    mat_lb  = reduce(vcat, [agg_lb[g].mean'  for g in genes_lb_sorted])
    mat_m63 = reduce(vcat, [agg_m63[g].mean' for g in genes_m63_sorted])

    @info "Clustering LB gene curves..."
    cl_lb  = cluster_gene_curves(mat_lb,  times, genes_lb_sorted)
    @info "Clustering M63 gene curves..."
    cl_m63 = cluster_gene_curves(mat_m63, times, genes_m63_sorted)

    # Downsample for JSON (every 4th point: 200 → 50 time points)
    ds_idx   = 1:4:n_tp
    times_ds = times[ds_idx]
    ds_vec(v) = round.(v[ds_idx]; digits=6)

    lb_cluster_map  = Dict(zip(genes_lb_sorted,  cl_lb.clusters))
    m63_cluster_map = Dict(zip(genes_m63_sorted, cl_m63.clusters))

    jw_ids = Dict{String,String}()
    for (g, info) in agg_lb;  jw_ids[g] = info.jw_id; end
    for (g, info) in agg_m63; haskey(jw_ids, g) || (jw_ids[g] = info.jw_id); end

    gene_records = [
        let rec = Dict{String,Any}("gene" => gene, "jw_id" => get(jw_ids, gene, ""))
            haskey(agg_lb,  gene) && (rec["LB"]  = Dict("mean" => ds_vec(agg_lb[gene].mean),
                "std" => ds_vec(agg_lb[gene].std),  "n_replicates" => agg_lb[gene].n,
                "cluster" => lb_cluster_map[gene]))
            haskey(agg_m63, gene) && (rec["M63"] = Dict("mean" => ds_vec(agg_m63[gene].mean),
                "std" => ds_vec(agg_m63[gene].std), "n_replicates" => agg_m63[gene].n,
                "cluster" => m63_cluster_map[gene]))
            rec
        end
        for gene in all_genes
    ]

    out = Dict(
        "metadata"    => Dict("n_genes" => length(all_genes),
                               "n_timepoints" => length(times_ds),
                               "media" => ["LB", "M63"]),
        "times"       => round.(times_ds; digits=4),
        "wcss_sweep"  => Dict("LB"  => Dict("ks" => cl_lb.ks,
                                               "wcss" => round.(cl_lb.wcss; digits=4)),
                               "M63" => Dict("ks" => cl_m63.ks,
                                               "wcss" => round.(cl_m63.wcss; digits=4))),
        "optimal_k"   => Dict("LB" => cl_lb.optimal_k, "M63" => cl_m63.optimal_k),
        "centroids"   => Dict("LB"  => [ds_vec(cl_lb.centroids_raw[k,:])  for k in 1:cl_lb.optimal_k],
                               "M63" => [ds_vec(cl_m63.centroids_raw[k,:]) for k in 1:cl_m63.optimal_k]),
        "centroids_z" => Dict("LB"  => [ds_vec(cl_lb.centroids_z[k,:])  for k in 1:cl_lb.optimal_k],
                               "M63" => [ds_vec(cl_m63.centroids_z[k,:]) for k in 1:cl_m63.optimal_k]),
        "genes"       => gene_records,
    )

    mkpath(dirname(OUT_PATH))
    open(OUT_PATH, "w") do io; JSON3.write(io, out) end
    @info "Output written to $OUT_PATH"
end

main()