Skip to content

Analyze

Activation analysis tools for measuring entropy, sparsity, and hidden state statistics.

ActivationAnalyzer

ActivationAnalyzer

Analyze model activations layer by layer.

Uses hooks to capture hidden states during inference and compute statistics about activation patterns.

Source code in src/model_garage/analyze/activations.py
class ActivationAnalyzer:
    """
    Analyze model activations layer by layer.

    Uses hooks to capture hidden states during inference and compute
    statistics about activation patterns. Can be used as a context manager;
    all hooks are removed on exit.
    """

    def __init__(self, model: nn.Module, device: Optional[str] = None):
        self.model = model
        # Default to wherever the model's parameters live so inputs are
        # moved to the right device without the caller specifying it.
        self.device = device or str(next(model.parameters()).device)
        self.hook_manager = HookManager(model)
        # Last results from analyze_all_layers(), exposed via `results`.
        self._results: Dict[str, Dict[str, Any]] = {}

    def _capture_output(
        self, layer_name: str, input_ids: torch.Tensor
    ) -> Optional[Dict[str, Any]]:
        """Run one hooked forward pass and return the data captured at *layer_name*.

        The capture hook is removed before returning on the success path;
        an exception during the forward pass leaves it registered (same as
        the previous inline implementations).
        """
        self.hook_manager.register_capture_hook(layer_name, hook_name=layer_name)

        with torch.no_grad():
            self.model(input_ids.to(self.device))

        data = self.hook_manager.get_captured(layer_name)
        self.hook_manager.remove_hook(layer_name)
        return data

    def analyze_layer(
        self,
        layer_name: str,
        input_ids: torch.Tensor,
    ) -> Dict[str, Any]:
        """
        Analyze activations at a specific layer.

        Args:
            layer_name: Layer to analyze (e.g., "model.layers.12")
            input_ids: Input token IDs

        Returns:
            Dict with activation statistics, or {"error": ...} when nothing
            was captured for the layer.
        """
        data = self._capture_output(layer_name, input_ids)

        if data is None or "output" not in data:
            return {"error": f"No data captured for {layer_name}"}

        stats = TensorUtils.stats(data["output"])
        stats["layer"] = layer_name
        return stats

    def analyze_all_layers(
        self,
        layer_names: list,
        input_ids: torch.Tensor,
    ) -> Dict[str, Dict[str, Any]]:
        """
        Analyze activations across multiple layers with a single forward pass.

        Args:
            layer_names: List of layer names
            input_ids: Input token IDs

        Returns:
            Dict mapping layer names to their stats; layers with no captured
            output are silently omitted.
        """
        for ln in layer_names:
            self.hook_manager.register_capture_hook(ln, hook_name=ln)

        with torch.no_grad():
            self.model(input_ids.to(self.device))

        results = {}
        for ln in layer_names:
            data = self.hook_manager.get_captured(ln)
            if data and "output" in data:
                results[ln] = TensorUtils.stats(data["output"])
                results[ln]["layer"] = ln

        self.hook_manager.remove_all()
        self._results = results
        return results

    def compute_entropy(self, layer_name: str, input_ids: torch.Tensor) -> float:
        """Compute mean Shannon entropy of the softmax over the last dim of the
        layer's output; returns NaN when nothing was captured."""
        data = self._capture_output(layer_name, input_ids)

        if data is None or "output" not in data:
            return float("nan")

        output = data["output"].float()
        # log_softmax is numerically stable and avoids the log(p + eps)
        # fudge factor of a naive softmax/log computation.
        log_probs = torch.log_softmax(output, dim=-1)
        entropy = -(log_probs.exp() * log_probs).sum(dim=-1).mean().item()
        return entropy

    @property
    def results(self) -> Dict[str, Dict[str, Any]]:
        """Get last analysis results."""
        return self._results

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        # Always clean up hooks, even when the body raised.
        self.hook_manager.remove_all()

results property

results

Get last analysis results.

analyze_layer

analyze_layer(layer_name, input_ids)

Analyze activations at a specific layer.

Parameters:

Name Type Description Default
layer_name str

Layer to analyze (e.g., "model.layers.12")

required
input_ids Tensor

Input token IDs

required

Returns:

Type Description
Dict[str, Any]

Dict with activation statistics

Source code in src/model_garage/analyze/activations.py
def analyze_layer(
    self,
    layer_name: str,
    input_ids: torch.Tensor,
) -> Dict[str, Any]:
    """
    Capture and summarize activations at one layer.

    Args:
        layer_name: Layer to analyze (e.g., "model.layers.12")
        input_ids: Input token IDs

    Returns:
        Dict with activation statistics
    """
    # Hook the target layer, drive a single no-grad forward pass, then
    # collect whatever the hook recorded.
    self.hook_manager.register_capture_hook(layer_name, hook_name=layer_name)
    with torch.no_grad():
        self.model(input_ids.to(self.device))

    captured = self.hook_manager.get_captured(layer_name)
    self.hook_manager.remove_hook(layer_name)

    if captured is None or "output" not in captured:
        return {"error": f"No data captured for {layer_name}"}

    summary = TensorUtils.stats(captured["output"])
    summary["layer"] = layer_name
    return summary

analyze_all_layers

analyze_all_layers(layer_names, input_ids)

Analyze activations across multiple layers.

Parameters:

Name Type Description Default
layer_names list

List of layer names

required
input_ids Tensor

Input token IDs

required

Returns:

Type Description
Dict[str, Dict[str, Any]]

Dict mapping layer names to their stats

Source code in src/model_garage/analyze/activations.py
def analyze_all_layers(
    self,
    layer_names: list,
    input_ids: torch.Tensor,
) -> Dict[str, Dict[str, Any]]:
    """
    Collect activation statistics for several layers in one forward pass.

    Args:
        layer_names: List of layer names
        input_ids: Input token IDs

    Returns:
        Dict mapping layer names to their stats
    """
    # Register every hook up front so a single inference covers all layers.
    for name in layer_names:
        self.hook_manager.register_capture_hook(name, hook_name=name)

    with torch.no_grad():
        self.model(input_ids.to(self.device))

    summaries: Dict[str, Dict[str, Any]] = {}
    for name in layer_names:
        captured = self.hook_manager.get_captured(name)
        if not captured or "output" not in captured:
            continue
        layer_stats = TensorUtils.stats(captured["output"])
        layer_stats["layer"] = name
        summaries[name] = layer_stats

    self.hook_manager.remove_all()
    self._results = summaries
    return summaries

compute_entropy

compute_entropy(layer_name, input_ids)

Compute entropy of activation distribution at a layer.

Source code in src/model_garage/analyze/activations.py
def compute_entropy(self, layer_name: str, input_ids: torch.Tensor) -> float:
    """Compute the mean Shannon entropy of the softmax distribution over the
    last dimension of the layer's output.

    Args:
        layer_name: Layer whose output to analyze.
        input_ids: Input token IDs.

    Returns:
        Mean entropy in nats, or NaN when nothing was captured for the layer.
    """
    self.hook_manager.register_capture_hook(layer_name, hook_name=layer_name)

    with torch.no_grad():
        self.model(input_ids.to(self.device))

    data = self.hook_manager.get_captured(layer_name)
    self.hook_manager.remove_hook(layer_name)

    if data is None or "output" not in data:
        return float("nan")

    output = data["output"].float()
    # log_softmax is numerically stable: it avoids the log(p + 1e-10)
    # fudge and underflow/overflow for large-magnitude activations.
    log_probs = torch.log_softmax(output, dim=-1)
    entropy = -(log_probs.exp() * log_probs).sum(dim=-1).mean().item()
    return entropy

BaseAnalyzer

BaseAnalyzer

Bases: ABC

Base class for model analyzers.

Defines the interface for analyzing neural pathways to understand which components contribute most to model behavior.

Source code in src/model_garage/analyze/base.py
class BaseAnalyzer(ABC):
    """
    Base class for model analyzers.

    Defines the interface for analyzing neural pathways to understand
    which components contribute most to model behavior.
    """

    def __init__(self, model: Any, framework: str = "pytorch"):
        self.model = model
        # Informational tag only; nothing in the base class branches on it.
        self.framework = framework
        # Hook handles registered by subclasses, keyed by layer name.
        self.activation_hooks = {}
        # Latest analysis output; serialized by save_results().
        self.results = {}

    @abstractmethod
    def register_hooks(self) -> None:
        """Register hooks to capture activations during forward pass."""
        pass

    @abstractmethod
    def analyze_activations(self, inputs: Any, labels: Any) -> Dict[str, Any]:
        """Analyze activations in response to specific inputs."""
        pass

    @abstractmethod
    def identify_important_neurons(self, threshold: float = 0.5) -> Dict[str, Any]:
        """Identify neurons that strongly correlate with target behavior."""
        pass

    def calculate_correlation(
        self,
        activations: List[Dict[str, Any]],
        outcomes: List[float],
    ) -> Dict[str, List[Tuple[int, float]]]:
        """
        Calculate correlation between neuron activations and outcomes.

        Args:
            activations: List of activation maps per sample
            outcomes: List of outcome values

        Returns:
            Dict mapping layers to (neuron_idx, correlation) tuples sorted by
            correlation magnitude. Neurons whose correlation is undefined
            (constant activation) are omitted.
        """
        outcomes_array = np.array(outcomes)
        correlations = {}

        # Layer set is taken from the first sample; samples missing a layer
        # are simply skipped for that layer.
        for layer_name in activations[0]:
            layer_acts = []
            for sample_idx in range(len(activations)):
                if layer_name in activations[sample_idx]:
                    layer_acts.append(activations[sample_idx][layer_name].flatten())

            if not layer_acts:
                continue

            layer_acts_array = np.array(layer_acts)
            layer_correlations = []

            for neuron_idx in range(layer_acts_array.shape[1]):
                neuron_acts = layer_acts_array[:, neuron_idx]
                # np.corrcoef does not raise on constant input -- it warns
                # and yields nan, and nan sort keys make the ordering below
                # nondeterministic. Skip degenerate neurons up front.
                if np.std(neuron_acts) == 0.0:
                    continue
                try:
                    corr = float(np.corrcoef(neuron_acts, outcomes_array)[0, 1])
                except Exception as e:
                    logger.warning(
                        f"Correlation failed for neuron {neuron_idx} in {layer_name}: {e}"
                    )
                    continue
                if np.isfinite(corr):
                    layer_correlations.append((neuron_idx, corr))

            layer_correlations.sort(key=lambda x: abs(x[1]), reverse=True)
            correlations[layer_name] = layer_correlations

        return correlations

    def get_top_neurons(
        self,
        correlations: Dict[str, List[Tuple[int, float]]],
        top_n: int = 10,
    ) -> Dict[str, List[Tuple[int, float]]]:
        """Get the top neurons by correlation magnitude."""
        return {
            layer: sorted(corrs, key=lambda x: abs(x[1]), reverse=True)[:top_n]
            for layer, corrs in correlations.items()
        }

    def save_results(self, output_path: str) -> str:
        """Save analysis results to disk as JSON.

        Non-dict values are stringified so the payload always serializes.

        Args:
            output_path: Destination file; parent directories are created.

        Returns:
            The path written to.
        """
        import json
        import os

        # os.makedirs("") raises, so only create parents when the path
        # actually has a directory component (bare filenames are valid).
        directory = os.path.dirname(output_path)
        if directory:
            os.makedirs(directory, exist_ok=True)

        serializable = {}
        for key, value in self.results.items():
            serializable[key] = value if isinstance(value, dict) else str(value)

        with open(output_path, "w") as f:
            json.dump(serializable, f, indent=2)

        return output_path

register_hooks abstractmethod

register_hooks()

Register hooks to capture activations during forward pass.

Source code in src/model_garage/analyze/base.py
@abstractmethod
def register_hooks(self) -> None:
    """Install the forward hooks that record activations.

    Subclasses must implement this; the base class provides no default.
    """
    ...

analyze_activations abstractmethod

analyze_activations(inputs, labels)

Analyze activations in response to specific inputs.

Source code in src/model_garage/analyze/base.py
@abstractmethod
def analyze_activations(self, inputs: Any, labels: Any) -> Dict[str, Any]:
    """Analyze activations in response to specific inputs.

    Implementations return a dict of analysis results.
    """
    ...

identify_important_neurons abstractmethod

identify_important_neurons(threshold=0.5)

Identify neurons that strongly correlate with target behavior.

Source code in src/model_garage/analyze/base.py
@abstractmethod
def identify_important_neurons(self, threshold: float = 0.5) -> Dict[str, Any]:
    """Identify neurons that strongly correlate with target behavior.

    Args:
        threshold: Minimum correlation magnitude for a neuron to count.
    """
    ...

calculate_correlation

calculate_correlation(activations, outcomes)

Calculate correlation between neuron activations and outcomes.

Parameters:

Name Type Description Default
activations List[Dict[str, Any]]

List of activation maps per sample

required
outcomes List[float]

List of outcome values

required

Returns:

Type Description
Dict[str, List[Tuple[int, float]]]

Dict mapping layers to (neuron_idx, correlation) tuples

Source code in src/model_garage/analyze/base.py
def calculate_correlation(
    self,
    activations: List[Dict[str, Any]],
    outcomes: List[float],
) -> Dict[str, List[Tuple[int, float]]]:
    """
    Calculate correlation between neuron activations and outcomes.

    Args:
        activations: List of activation maps per sample
        outcomes: List of outcome values

    Returns:
        Dict mapping layers to (neuron_idx, correlation) tuples sorted by
        correlation magnitude. Neurons whose correlation is undefined
        (constant activation) are omitted.
    """
    outcomes_array = np.array(outcomes)
    correlations = {}

    # Layer set is taken from the first sample; samples missing a layer
    # are simply skipped for that layer.
    for layer_name in activations[0]:
        layer_acts = []
        for sample_idx in range(len(activations)):
            if layer_name in activations[sample_idx]:
                layer_acts.append(activations[sample_idx][layer_name].flatten())

        if not layer_acts:
            continue

        layer_acts_array = np.array(layer_acts)
        layer_correlations = []

        for neuron_idx in range(layer_acts_array.shape[1]):
            neuron_acts = layer_acts_array[:, neuron_idx]
            # np.corrcoef does not raise on constant input -- it warns and
            # yields nan, and nan sort keys make the ordering below
            # nondeterministic. Skip degenerate neurons up front.
            if np.std(neuron_acts) == 0.0:
                continue
            try:
                corr = float(np.corrcoef(neuron_acts, outcomes_array)[0, 1])
            except Exception as e:
                logger.warning(
                    f"Correlation failed for neuron {neuron_idx} in {layer_name}: {e}"
                )
                continue
            if np.isfinite(corr):
                layer_correlations.append((neuron_idx, corr))

        layer_correlations.sort(key=lambda x: abs(x[1]), reverse=True)
        correlations[layer_name] = layer_correlations

    return correlations

get_top_neurons

get_top_neurons(correlations, top_n=10)

Get the top neurons by correlation magnitude.

Source code in src/model_garage/analyze/base.py
def get_top_neurons(
    self,
    correlations: Dict[str, List[Tuple[int, float]]],
    top_n: int = 10,
) -> Dict[str, List[Tuple[int, float]]]:
    """Return, per layer, the top_n (neuron, correlation) pairs ranked by |correlation|."""
    ranked: Dict[str, List[Tuple[int, float]]] = {}
    for layer_name, pairs in correlations.items():
        ordered = sorted(pairs, key=lambda pair: abs(pair[1]), reverse=True)
        ranked[layer_name] = ordered[:top_n]
    return ranked

save_results

save_results(output_path)

Save analysis results to disk.

Source code in src/model_garage/analyze/base.py
def save_results(self, output_path: str) -> str:
    """Save analysis results to disk as JSON.

    Non-dict values are stringified so the payload always serializes.

    Args:
        output_path: Destination file; parent directories are created.

    Returns:
        The path written to.
    """
    import json
    import os

    # os.makedirs("") raises, so only create parents when the path
    # actually has a directory component (bare filenames are valid).
    directory = os.path.dirname(output_path)
    if directory:
        os.makedirs(directory, exist_ok=True)

    serializable = {}
    for key, value in self.results.items():
        serializable[key] = value if isinstance(value, dict) else str(value)

    with open(output_path, "w") as f:
        json.dump(serializable, f, indent=2)

    return output_path