Module `toplot.hidden`

Visualize point estimate or average of hidden units/topic proportions.

This module provides functions to visualize how topics are distributed across records in your dataset. The input data should be organized as a DataFrame where: - Each row represents one example/record - Each column represents one topic - Values are proportions that sum to 1.0 across topics (columns)

Example DataFrame structure:

         Topic_1  Topic_2  Topic_3
Alice    0.7     0.2      0.1
Bob      0.3     0.6      0.1
...

Functions

def plot_cohort(hidden, sort: Literal['travelling-salesman'] | None = 'travelling-salesman', ax=None)

Expand source code

def plot_cohort(
    hidden, sort: Literal["travelling-salesman"] | None = "travelling-salesman", ax=None
):
    """Plot the average topic proportions for a set of records/participants/examples.

    N.B., requires the posterior average or a point estimate of the hidden units/topic
    proportions.

    Args:
        hidden: For each record/participant (rows), the proportion per topic
            (columns).
        sort: Whether to sort the participants according to the shortest Jensen-Shannon
            distance path ("travelling-salesman"), or no sorting (None).
        ax: Matplotlib axes to plot on.

    Returns:
        Reference to matplotlib axes.

    Example:
        ```python
        from pandas import DataFrame
        from numpy.random import dirichlet

        hidden = DataFrame(dirichlet([0.6, 0.8, 0.2], size=30), columns=['A', 'B', 'C'])
        plot_cohort(hidden)
        ```
    """
    if ax is None:
        ax = plt.gca()

    n_topics = hidden.shape[-1]

    topic_labels = [str(k + 1) for k in range(n_topics)]
    if isinstance(hidden, pd.DataFrame):
        topic_labels = hidden.columns
        hidden = hidden.to_numpy()

    if sort == "travelling-salesman":
        hidden_sorted = _sorted_topic_proportions(hidden)
    elif sort is None:
        hidden_sorted = hidden
    else:
        raise ValueError(f"Unknown sort method: {sort}")

    # Compute offsets for stacking bars.
    p_cum = np.cumsum(hidden_sorted, axis=1)
    p_cum_zero_padded = np.pad(
        p_cum, ((0, 0), (1, 0)), mode="constant", constant_values=0
    )
    x = np.arange(len(hidden_sorted)) + 1
    dx = x[1] - x[0]
    bar_width = dx

    ax.yaxis.set_major_formatter(mtick.PercentFormatter(1.0))
    ax.set_ylabel("Topic proportion")
    ax.set_ylim(0, 1)

    for k in range(n_topics):
        ax.bar(
            x,
            hidden_sorted[:, k],
            width=bar_width,
            label=topic_labels[k],
            bottom=p_cum_zero_padded[:, k],
            linewidth=0.0,
        )
    ax.legend(title="Topic:")
    return ax

Plot the average topic proportions for a set of records/participants/examples.

N.B., requires the posterior average or a point estimate of the hidden units/topic proportions.

Args

hidden: For each record/participant (rows), the proportion per topic (columns).
sort: Whether to sort the participants according to the shortest Jensen-Shannon distance path ("travelling-salesman"), or no sorting (None).
ax: Matplotlib axes to plot on.

Returns

Reference to matplotlib axes.

Example

from pandas import DataFrame
from numpy.random import dirichlet

hidden = DataFrame(dirichlet([0.6, 0.8, 0.2], size=30), columns=['A', 'B', 'C'])
plot_cohort(hidden)

def plot_polar_cohort(hidden, ax=None, sort: Literal['travelling-salesman'] | None = 'travelling-salesman')

Expand source code

def plot_polar_cohort(
    hidden, ax=None, sort: Literal["travelling-salesman"] | None = "travelling-salesman"
):
    """Make radial bar plots for a set of records/participants/examples.

    Args:
        hidden: For each record (rows), the proportion per topic (columns).
        sort: Whether to sort the records according to the shortest Jensen-Shannon
            distance path ("travelling-salesman"), or no sorting (None).
        ax: Matplotlib axes to plot on.

    Returns:
        Reference to matplotlib axes.

    Example:
        ```python
        from pandas import DataFrame
        from numpy.random import dirichlet

        hidden = DataFrame(dirichlet([0.6, 0.8, 0.2], size=30), columns=['A', 'B', 'C'])
        plot_polar_cohort(hidden)
        ```
    """
    if ax is None:
        ax = plt.gcf().add_subplot(projection="polar")

    m_samples, n_topics = hidden.shape
    topic_labels = [str(k + 1) for k in range(n_topics)]
    if isinstance(hidden, pd.DataFrame):
        topic_labels = hidden.columns
        hidden = hidden.to_numpy()

    if sort == "travelling-salesman":
        hidden_sorted = _sorted_topic_proportions(hidden)
    elif sort is None:
        hidden_sorted = hidden
    else:
        raise ValueError(f"Unknown sort method: {sort}")

    p_cum = np.cumsum(hidden_sorted, axis=1)
    p_cum_zero_padded = np.pad(
        p_cum, ((0, 0), (1, 0)), mode="constant", constant_values=0
    )

    x = np.linspace(0, 2 * np.pi, m_samples, endpoint=False)
    dx = x[1] - x[0]
    bar_width = dx
    ax.set_rlim(0, 1.0)
    ax.set_rorigin(-0.5)

    for k in range(n_topics):
        ax.bar(
            x,
            hidden_sorted[:, k],
            width=bar_width,
            label=topic_labels[k],
            bottom=p_cum_zero_padded[:, k],
        )

    ax.legend(loc="lower left", frameon=False, bbox_to_anchor=(1.0, 0), title="Topic:")
    ax.axis("off")
    return ax

Make radial bar plots for a set of records/participants/examples.

Args

hidden: For each record (rows), the proportion per topic (columns).
sort: Whether to sort the records according to the shortest Jensen-Shannon distance path ("travelling-salesman"), or no sorting (None).
ax: Matplotlib axes to plot on.

Returns

Reference to matplotlib axes.

Example

from pandas import DataFrame
from numpy.random import dirichlet

hidden = DataFrame(dirichlet([0.6, 0.8, 0.2], size=30), columns=['A', 'B', 'C'])
plot_polar_cohort(hidden)