# Source code for cardsort.analysis

import logging
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.cluster import hierarchy
from scipy.spatial.distance import squareform
from typing import List, Union

# Public names exported by a star-import of this module.
# NOTE(review): "get_cluster_labels_df" is not defined in the part of the file
# visible here -- confirm it exists elsewhere or drop it from the list.
__all__ = [
    "get_distance_matrix",
    "create_dendrogram",
    "get_cluster_labels",
    "get_cluster_labels_df",
]


# Module-level logger named after this module; the functions below report
# validation errors and progress messages through it.
logger = logging.getLogger(__name__)


def _check_data(df: pd.DataFrame) -> bool:
    """
    Check if input data is in the correct format.

    The checks run in the order below; the first one that fails is logged as an error
    and ends the validation:

    1. The DataFrame is not empty.
    2. The first user_id that appears in the data equals 1.
    3. Every card_id is associated with exactly one card_label.
    4. No user categorized the same card more than once.
    5. Every user categorized every card.

    Parameters
    ----------
    df : pandas.DataFrame
        Columns:
            Name: card_id, dtype: int64
            Name: card_label, dtype: object
            Name: category_id, dtype: int64
            Name: category_label, dtype: object
            Name: user_id, dtype: int64
        These columns correspond to the 'Casolysis Data (.csv) - Recommended' export from kardsort.com.

    Returns
    -------
    out : bool
        True if the input data is in the correct format, False otherwise.
    """
    # An empty frame has no "first" user_id to look at; report it as invalid
    # instead of letting the positional lookup below raise an IndexError.
    if df.empty:
        logger.error("The DataFrame is empty.")
        return False
    # check if first user_id is 1
    if df["user_id"].unique()[0] != 1:
        logger.error("First user_id does not equal 1.")
        return False
    # check if each card_id is always associated with exactly one card_label
    labels_per_card = df.groupby("card_id")["card_label"].nunique()
    ambiguous_cards = labels_per_card[labels_per_card != 1]
    if not ambiguous_cards.empty:
        # Only the first offending card is reported.
        card_id, count = next(iter(ambiguous_cards.items()))
        logger.error(
            f"Card_id {card_id} is associated with {count} different card_labels."
        )
        return False
    # check if all users categorize each card exactly once
    counts = df.groupby(["user_id", "card_id"]).size()
    if (counts > 1).any():
        logger.error("At least one user categorized at least one card more than once.")
        return False
    # With duplicates ruled out, a complete data set has exactly one
    # (user_id, card_id) pair per combination of user and card.
    n_cards = df["card_id"].nunique()
    n_users = df["user_id"].nunique()
    if len(counts) != n_cards * n_users:
        logger.error("At least one user does not categorized at least one card.")
        return False
    return True


def _get_distance_matrix_for_user(df_user: pd.DataFrame) -> np.ndarray:
    """
    Return distance matrix for an individual user.

    Parameters
    ----------
    df_user : pandas.DataFrame (subset for a single user_id)
        Columns:
            Name: card_id, dtype: int64
            Name: card_label, dtype: object
            Name: category_id, dtype: int64
            Name: category_label, dtype: object
            Name: user_id, dtype: int64
        These columns correspond to the 'Casolysis Data (.csv) - Recommended' export from kardsort.com.

    Returns
    -------
    out : np.ndarray
        A square float matrix with one row and one column per row of df_user, ordered by card_id.
        An entry is 0.0 if the two cards carry the same category_label (the user put them together)
        and 1.0 otherwise.
    """
    # Order by card_id so that row/column i refers to the same card for every user.
    labels_by_card = df_user.sort_values("card_id")["category_label"].values
    # Broadcasting the flat array against a column view compares every pair of cards.
    differs = labels_by_card != labels_by_card[:, None]
    return differs.astype(float)



# [docs]
def get_distance_matrix(df: pd.DataFrame) -> Union[np.ndarray, None]:
    """
    Return condensed distance matrix from kardsort data.

    A square per-user matrix is computed for every user_id with
    _get_distance_matrix_for_user, the per-user matrices are added up, and the sum is
    converted to condensed form with scipy's squareform.

    Parameters
    ----------
    df : pandas.DataFrame
        Columns:
            Name: card_id, dtype: int64
            Name: card_label, dtype: object
            Name: category_id, dtype: int64
            Name: category_label, dtype: object
            Name: user_id, dtype: int64
        These columns correspond to the 'Casolysis Data (.csv) - Recommended' export from kardsort.com.

    Returns
    -------
    out : ndarray
        A condensed distance matrix (a flat array containing the upper triangle of a distance matrix)
        representing the pairwise distance of all cards: for each pair, the number of users whose
        category_label differs between the two cards.
    OR
    out : None
        If df does not pass the format check (an error is logged).
    """
    if not _check_data(df):
        logger.error(
            "The DataFrame does not correspond to the required format. No distance matrix generated."
        )
        return None

    # Start from "no matrix yet" rather than keying on a particular user_id, so the
    # accumulation does not depend on which id happens to come first.
    distance_matrix_all = None
    for id_ in df["user_id"].unique():
        df_u = df.loc[df["user_id"] == id_]
        logger.info(f"Computing distance matrix for user {id_}")
        distance_matrix_user = _get_distance_matrix_for_user(df_u)
        if distance_matrix_all is None:
            distance_matrix_all = distance_matrix_user
        else:
            distance_matrix_all = np.add(distance_matrix_all, distance_matrix_user)
    return squareform(distance_matrix_all)




# [docs]
def create_dendrogram(
    df, distance_matrix=None, count="fraction", linkage="average", color_threshold=None
) -> None:
    """
    Plot hierarchical clustering of kardsort data as dendrogram.

    The figure is created with matplotlib and displayed with ``plt.show()``; nothing is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Columns:
            Name: card_id, dtype: int64
            Name: card_label, dtype: object
            Name: category_id, dtype: int64
            Name: category_label, dtype: object
            Name: user_id, dtype: int64
        These columns correspond to the 'Casolysis Data (.csv) - Recommended' export from kardsort.com.
        The dataframe is used to extract leaf labels, and, if no distance_matrix provided, to calculate the distance matrix.

    distance_matrix : ndarray, optional
        Takes a condensed distance matrix as input: A flat array containing the upper triangular of the distance matrix.
        A pre-calculated condensed distance matrix can be provided to save time generating the dendrogram.
        If not specified, a new distance matrix will be calculated from df.

    count : str, optional
        How similarity is displayed.

        'fraction'
        Similarity is displayed as a fraction between 0 and 1.

        'absolute'
        Similarity is displayed as absolute counts from 0 to n = number of users.

    linkage : str, optional
        Linkage method used to compute the distance between two clusters.

        'average'
        Unweighted average distance between all elements in the clusters (UPGMA).

        'complete'
        Distance between the elements that are the farthest away from each other in the two clusters.

        'single'
        Distance between the elements that are the closest each other in the two clusters.

    color_threshold : double, optional
        Level below which to cut the color threshold in the dendrogram branches.
        Can be a fraction (0 - 1) or an absolute value (<= n = number of users).
        The default cut is at 75%.

    Returns
    -------
    out : None
        Also returned (after logging an error) when df does not pass the format check.

    Raises
    ------
    ValueError
        If count or linkage is not one of the supported values.
    """
    if not _check_data(df):
        logger.error(
            "The DataFrame does not correspond to the required format. No dendrogram generated."
        )
        return None

    # Reject unsupported options before any distance matrix is computed.
    count_types = ["absolute", "fraction"]
    if count not in count_types:
        raise ValueError("Invalid count type. Expected one of: %s" % count_types)

    linkage_types = ["average", "complete", "single"]
    if linkage not in linkage_types:
        raise ValueError("Invalid linkage. Expected one of: %s" % linkage_types)

    if distance_matrix is None:
        distance_matrix = get_distance_matrix(df)

    max_distance = np.max(distance_matrix)
    if count == "fraction":
        # When every distance is 0 the division would be 0/0 and turn the whole
        # matrix into NaN; the matrix is left untouched in that case.
        if max_distance > 0:
            distance_matrix = distance_matrix / max_distance
        if color_threshold is None:
            color_threshold = 0.75
    elif color_threshold is None:
        color_threshold = max_distance * 0.75

    Z = hierarchy.linkage(distance_matrix, linkage)
    plt.figure(layout="constrained")
    # Leaf labels come from user 1, ordered by card_id to match the row order
    # used when the per-user distance matrices are built.
    labels = (
        df.loc[df["user_id"] == 1].sort_values("card_id")["card_label"].to_list()
    )
    dn = hierarchy.dendrogram(
        Z, labels=labels, orientation="right", color_threshold=color_threshold
    )

    # Tenth-steps for distances on a 0-1 scale, whole-number steps otherwise.
    x_max = np.max(distance_matrix)
    plt.xticks(
        np.arange(0.0, 1.1, 0.1) if x_max <= 1 else np.arange(0, x_max + 1, 1)
    )

    # Give each leaf label the color of the cluster branch it hangs from.
    for leaf, leaf_color in zip(
        plt.gca().get_yticklabels(), dn["leaves_color_list"]
    ):
        leaf.set_color(leaf_color)
    plt.show()



def _get_cluster_label_for_user(
    df_u: pd.DataFrame, cluster_cards: List[str]
) -> Union[str, None]:
    """
    Return the label an individual user created for a cluster including a given list of cards.

    Parameters
    ----------
    df_u : pandas.DataFrame (subset for a single user_id)
        Columns:
                Name: card_id, dtype: int64
                Name: card_label, dtype: object
                Name: category_id, dtype: int64
                Name: category_label, dtype: object
                Name: user_id, dtype: int64
        These columns correspond to the 'Casolysis Data (.csv) - Recommended' export from kardsort.com.

    cluster_cards : list of str
        List of card-labels for which you would like to get user-generated cluster-labels.

    Returns
    -------
    out : str
        Category_label shared by all rows whose card_label is in cluster_cards.
    OR
    out : None
        If the matching rows carry more than one category_label, or if no row matches.
    """
    is_requested_card = df_u["card_label"].isin(cluster_cards)
    categories = df_u.loc[is_requested_card, "category_label"].unique()
    # Anything other than exactly one distinct category means the cards were
    # not grouped under a single label.
    if len(categories) != 1:
        return None
    return categories.squeeze().tolist()


def _get_cards_for_label(cluster_label: str, df_u: pd.DataFrame) -> List[str]:
    """
    Return list of all cards an individual user filed under a given cluster label.

    Parameters
    ----------
    cluster_label : str
        A category label
    df_u : pandas.DataFrame (subset for an individual user_id)
        Columns:
                Name: card_id, dtype: int64
                Name: card_label, dtype: object
                Name: category_id, dtype: int64
                Name: category_label, dtype: object
                Name: user_id, dtype: int64
        These columns correspond to the 'Casolysis Data (.csv) - Recommended' export from kardsort.com.

    Returns
    -------
    out : List of str
        All card_labels whose category_label equals cluster_label, in the row order of df_u
        (empty if no row matches).
    """
    has_label = df_u["category_label"] == cluster_label
    matching_cards = df_u.loc[has_label, "card_label"]
    return matching_cards.tolist()



# [docs]
def get_cluster_labels(
    df: pd.DataFrame,
    cluster_cards: List[str],
    print_results: bool = True,
    return_df_results: bool = True,
) -> Union[pd.DataFrame, None]:
    """
    Return labels users created for clusters including a given list of cards.

    Parameters
    ----------
    df : pandas.DataFrame
        Columns:
                Name: card_id, dtype: int64
                Name: card_label, dtype: object
                Name: category_id, dtype: int64
                Name: category_label, dtype: object
                Name: user_id, dtype: int64
        These columns correspond to the 'Casolysis Data (.csv) - Recommended' export from kardsort.com.

    cluster_cards : list of str
        List of card-labels for which you would like to get user-generated cluster-labels.
        Labels that do not occur in df are dropped (with a log message) before the lookup.

    print_results : bool, optional
        If true, logs (at INFO level) which users grouped cards together and under which label

    return_df_results: bool, optional
       If true, returns a dataframe with results

    Returns
    -------
    out : pandas.DataFrame (default)
        Columns:
            Name: user_id, int
            Name: cluster_label, str
            Name: cards, list of str
        Dataframe with one row for each user who clustered the given cards together, including category label and
        the full list of cards in that category. The three columns are present even when no user
        clustered the cards together (the dataframe then has no rows).
    OR
    out : None
        If return_df_results = False, if df does not pass the format check, or if none of the
        given card-labels occurs in df.
    """
    if not _check_data(df):
        logger.error(
            "The data does not correspond to the required format. No cluster labels extracted."
        )
        return None

    # Drop requested labels that do not exist in the data, keeping the caller's order.
    missing_card_labels = set(cluster_cards) - set(df["card_label"])
    if missing_card_labels:
        logger.info(
            f'"{missing_card_labels}" is/are not a valid card label. Removed from list.'
        )
        cluster_cards = [
            card_label
            for card_label in cluster_cards
            if card_label not in missing_card_labels
        ]
        if cluster_cards:
            logger.info("Continue with cards: %s" % cluster_cards)
        else:
            logger.info("No cards left in list.")
            return None

    cluster_list = []
    for id_ in df["user_id"].unique():
        df_u = df.loc[df["user_id"] == id_]
        cluster_label = _get_cluster_label_for_user(df_u, cluster_cards)
        if cluster_label is None:
            if print_results:
                logger.info(f"User {id_} did not cluster cards together.")
            continue
        if print_results:
            logger.info(f"User {id_} labeled card(s): {cluster_label}")
        if return_df_results:
            cards = _get_cards_for_label(cluster_label, df_u)
            cluster_list.append(
                {"user_id": id_, "cluster_label": cluster_label, "cards": cards}
            )

    if not return_df_results:
        return None
    # Naming the columns explicitly keeps the documented schema even when no user
    # matched and cluster_list is empty.
    return pd.DataFrame(cluster_list, columns=["user_id", "cluster_label", "cards"])