Skip to content

Judges

Evaluators that determine trial winners.


MetricsJudge

orc.judges.metrics_judge.MetricsJudge

Judge that evaluates based on objective metrics.

Uses configurable weights for different metrics to calculate a final score for each submission.

Example

judge = MetricsJudge(
    weights={"accuracy": 0.5, "latency": 0.3, "cost": 0.2},
    accuracy_checker=my_accuracy_function,
)

verdict = await judge.evaluate(task, submissions)

Source code in orc/judges/metrics_judge.py
class MetricsJudge:
    """
    Judge that evaluates based on objective metrics.

    Uses configurable weights for different metrics (accuracy, latency,
    cost) to calculate a final weighted score for each submission, then
    picks the highest-scoring agent.

    Example:
        judge = MetricsJudge(
            weights={"accuracy": 0.5, "latency": 0.3, "cost": 0.2},
            accuracy_checker=my_accuracy_function,
        )

        verdict = await judge.evaluate(task, submissions)
    """

    def __init__(
        self,
        weights: Optional[Dict[str, float]] = None,
        accuracy_checker: Optional[Callable] = None,
        latency_threshold_ms: int = 5000,
        cost_threshold: float = 0.10,
    ):
        """
        Initialize the Metrics Judge.

        Args:
            weights: Metric weights (should sum to 1.0). Falsy values
                (None or {}) fall back to the default split.
            accuracy_checker: Callable ``(task, result)`` returning an
                accuracy in [0.0, 1.0]; may be sync or async. Failures
                fall back to a neutral 0.5.
            latency_threshold_ms: Latency at/above this gets score 0.
            cost_threshold: Cost at/above this gets score 0.
        """
        self.weights = weights or {"accuracy": 0.5, "latency": 0.3, "cost": 0.2}
        self.accuracy_checker = accuracy_checker
        self.latency_threshold_ms = latency_threshold_ms
        self.cost_threshold = cost_threshold

    async def evaluate(
        self,
        task: str,
        submissions: List[Submission],
    ) -> Verdict:
        """
        Evaluate submissions based on metrics.

        Args:
            task: The original task description.
            submissions: Non-empty list of agent submissions.

        Returns:
            Verdict with the winner and scores.

        Raises:
            ValueError: If ``submissions`` is empty.
        """
        import inspect  # local import: only needed to detect async checkers

        if not submissions:
            # Fail fast: previously this fell through to max() over an
            # empty dict, which raised an opaque ValueError.
            raise ValueError("MetricsJudge.evaluate requires at least one submission")

        scores: Dict[str, float] = {}
        details: Dict[str, Dict[str, float]] = {}

        for sub in submissions:
            agent_scores: Dict[str, float] = {}

            # Accuracy score.
            if self.accuracy_checker:
                try:
                    accuracy = self.accuracy_checker(task, sub.result)
                    # Accept sync checkers too: previously awaiting a plain
                    # float raised TypeError and was silently scored 0.5.
                    if inspect.isawaitable(accuracy):
                        accuracy = await accuracy
                except Exception:
                    accuracy = 0.5  # Neutral default if checker fails.
            else:
                # Default: success = 1.0, failure = 0.0, unknown = 0.5.
                if hasattr(sub.result, "is_success"):
                    accuracy = 1.0 if sub.result.is_success else 0.0
                else:
                    accuracy = 0.5
            agent_scores["accuracy"] = accuracy

            # Latency score (lower is better): linear falloff, 0 at threshold.
            if sub.latency_ms is not None:
                if sub.latency_ms >= self.latency_threshold_ms:
                    latency_score = 0.0
                else:
                    latency_score = 1.0 - (sub.latency_ms / self.latency_threshold_ms)
            else:
                latency_score = 0.5  # Unknown latency scores neutral.
            agent_scores["latency"] = latency_score

            # Cost score (lower is better): linear falloff, 0 at threshold.
            if sub.cost is not None:
                if sub.cost >= self.cost_threshold:
                    cost_score = 0.0
                else:
                    cost_score = 1.0 - (sub.cost / self.cost_threshold)
            else:
                cost_score = 0.5  # Unknown cost scores neutral.
            agent_scores["cost"] = cost_score

            # Weighted total; metrics missing from self.weights contribute 0.
            total = sum(
                agent_scores.get(metric, 0) * weight
                for metric, weight in self.weights.items()
            )

            scores[sub.agent] = total
            details[sub.agent] = agent_scores

        # Determine winner (highest weighted total).
        winner = max(scores, key=scores.get)

        # Check for tie: top two totals within an absolute 0.05 margin.
        sorted_scores = sorted(scores.values(), reverse=True)
        if len(sorted_scores) >= 2:
            margin = sorted_scores[0] - sorted_scores[1]
            is_tie = margin < 0.05
        else:
            is_tie = False

        return Verdict(
            winner="tie" if is_tie else winner,
            # Pass is_tie so the reasoning matches the verdict; previously
            # the reasoning named a winner even when the verdict was a tie.
            reasoning=self._build_reasoning(details, winner, is_tie),
            scores=scores,
            confidence=0.9 if not is_tie else 0.5,
            metadata={"metric_details": details},
        )

    def _build_reasoning(
        self,
        details: Dict[str, Dict[str, float]],
        winner: str,
        is_tie: bool = False,
    ) -> str:
        """Build human-readable reasoning from per-agent metric scores."""
        # Report "tie" as the outcome when scores were too close to call.
        outcome = "tie" if is_tie else winner
        lines = [f"Winner: {outcome}", "", "Score breakdown:"]

        for agent, metrics in details.items():
            lines.append(f"\n{agent}:")
            for metric, score in metrics.items():
                weight = self.weights.get(metric, 0)
                lines.append(f"  {metric}: {score:.2f} (weight: {weight})")

        return "\n".join(lines)

__init__(weights=None, accuracy_checker=None, latency_threshold_ms=5000, cost_threshold=0.1)

Initialize the Metrics Judge.

Parameters:

Name Type Description Default
weights Optional[Dict[str, float]]

Metric weights (must sum to 1.0).

None
accuracy_checker Optional[Callable]

Function to check result accuracy (returns 0.0-1.0).

None
latency_threshold_ms int

Latency above this gets score 0.

5000
cost_threshold float

Cost above this gets score 0.

0.1
Source code in orc/judges/metrics_judge.py
def __init__(
    self,
    weights: Optional[Dict[str, float]] = None,
    accuracy_checker: Optional[Callable] = None,
    latency_threshold_ms: int = 5000,
    cost_threshold: float = 0.10,
):
    """
    Set up metric weighting and scoring thresholds.

    Args:
        weights: Per-metric weights (must sum to 1.0).
        accuracy_checker: Callable returning an accuracy in [0.0, 1.0].
        latency_threshold_ms: Latencies at/above this score 0.
        cost_threshold: Costs at/above this score 0.
    """
    # Falsy `weights` (None or {}) falls back to the standard split.
    default_weights = {"accuracy": 0.5, "latency": 0.3, "cost": 0.2}
    self.weights = weights or default_weights
    self.accuracy_checker = accuracy_checker
    self.cost_threshold = cost_threshold
    self.latency_threshold_ms = latency_threshold_ms

evaluate(task, submissions) async

Evaluate submissions based on metrics.

Parameters:

Name Type Description Default
task str

The original task description.

required
submissions List[Submission]

List of agent submissions.

required

Returns:

Type Description
Verdict

Verdict with the winner and scores.

Source code in orc/judges/metrics_judge.py
async def evaluate(
    self,
    task: str,
    submissions: List[Submission],
) -> Verdict:
    """
    Evaluate submissions based on metrics.

    Args:
        task: The original task description.
        submissions: Non-empty list of agent submissions.

    Returns:
        Verdict with the winner and scores.

    Raises:
        ValueError: If ``submissions`` is empty.
    """
    import inspect  # local import: only needed to detect async checkers

    if not submissions:
        # Fail fast: previously this fell through to max() over an
        # empty dict, which raised an opaque ValueError.
        raise ValueError("MetricsJudge.evaluate requires at least one submission")

    scores: Dict[str, float] = {}
    details: Dict[str, Dict[str, float]] = {}

    for sub in submissions:
        agent_scores: Dict[str, float] = {}

        # Accuracy score.
        if self.accuracy_checker:
            try:
                accuracy = self.accuracy_checker(task, sub.result)
                # Accept sync checkers too: previously awaiting a plain
                # float raised TypeError and was silently scored 0.5.
                if inspect.isawaitable(accuracy):
                    accuracy = await accuracy
            except Exception:
                accuracy = 0.5  # Neutral default if checker fails.
        else:
            # Default: success = 1.0, failure = 0.0, unknown = 0.5.
            if hasattr(sub.result, "is_success"):
                accuracy = 1.0 if sub.result.is_success else 0.0
            else:
                accuracy = 0.5
        agent_scores["accuracy"] = accuracy

        # Latency score (lower is better): linear falloff, 0 at threshold.
        if sub.latency_ms is not None:
            if sub.latency_ms >= self.latency_threshold_ms:
                latency_score = 0.0
            else:
                latency_score = 1.0 - (sub.latency_ms / self.latency_threshold_ms)
        else:
            latency_score = 0.5  # Unknown latency scores neutral.
        agent_scores["latency"] = latency_score

        # Cost score (lower is better): linear falloff, 0 at threshold.
        if sub.cost is not None:
            if sub.cost >= self.cost_threshold:
                cost_score = 0.0
            else:
                cost_score = 1.0 - (sub.cost / self.cost_threshold)
        else:
            cost_score = 0.5  # Unknown cost scores neutral.
        agent_scores["cost"] = cost_score

        # Weighted total; metrics missing from self.weights contribute 0.
        total = sum(
            agent_scores.get(metric, 0) * weight
            for metric, weight in self.weights.items()
        )

        scores[sub.agent] = total
        details[sub.agent] = agent_scores

    # Determine winner (highest weighted total).
    winner = max(scores, key=scores.get)

    # Check for tie: top two totals within an absolute 0.05 margin.
    sorted_scores = sorted(scores.values(), reverse=True)
    if len(sorted_scores) >= 2:
        margin = sorted_scores[0] - sorted_scores[1]
        is_tie = margin < 0.05
    else:
        is_tie = False

    # NOTE(review): on a tie the reasoning below still names the top-scoring
    # agent rather than "tie" — fixing that requires changing
    # _build_reasoning, which lives outside this snippet.
    return Verdict(
        winner="tie" if is_tie else winner,
        reasoning=self._build_reasoning(details, winner),
        scores=scores,
        confidence=0.9 if not is_tie else 0.5,
        metadata={"metric_details": details},
    )

LLMJudge

orc.judges.llm_judge.LLMJudge

Judge that uses an LLM to evaluate submissions.

The LLM compares agent outputs based on specified criteria and determines which agent performed better.

Example

llm = OllamaProvider(model="qwen2.5:72b")
judge = LLMJudge(
    llm,
    criteria=["accuracy", "completeness", "efficiency"],
)

verdict = await judge.evaluate(task, submissions)

Source code in orc/judges/llm_judge.py
class LLMJudge:
    """
    Judge that uses an LLM to evaluate submissions.

    The LLM compares the first two agent submissions based on the
    specified criteria and determines which agent performed better.
    Submissions beyond the first two are ignored.

    Example:
        llm = OllamaProvider(model="qwen2.5:72b")
        judge = LLMJudge(
            llm,
            criteria=["accuracy", "completeness", "efficiency"],
        )

        verdict = await judge.evaluate(task, submissions)
    """

    def __init__(
        self,
        llm: LLMProvider,
        criteria: Optional[List[str]] = None,
        system_prompt: Optional[str] = None,
    ):
        """
        Initialize the LLM Judge.

        Args:
            llm: LLM provider for evaluation.
            criteria: Evaluation criteria (default: accuracy, completeness,
                clarity, efficiency).
            system_prompt: Custom system prompt for evaluation.
        """
        self.llm = llm
        self.criteria = criteria or ["accuracy", "completeness", "clarity", "efficiency"]
        self.system_prompt = system_prompt or self._default_system_prompt()

    def _default_system_prompt(self) -> str:
        """Build the default judging prompt listing the active criteria."""
        criteria_str = ", ".join(self.criteria)
        return f"""You are an impartial judge evaluating agent submissions.

Your task is to compare two agent submissions for the same task and determine which one is better.

Evaluation criteria: {criteria_str}

You must respond with valid JSON in this format:
{{
    "winner": "A" or "B",
    "reasoning": "Detailed explanation of why this submission is better",
    "scores": {{
        "A": 0.0-1.0,
        "B": 0.0-1.0
    }},
    "confidence": 0.0-1.0
}}

Be fair and objective. If submissions are truly equivalent,
you may declare a tie by setting winner to "tie".
"""

    async def evaluate(
        self,
        task: str,
        submissions: List[Submission],
    ) -> Verdict:
        """
        Evaluate submissions and determine a winner.

        Only the first two submissions are compared; extras are ignored.

        Args:
            task: The original task description.
            submissions: Non-empty list of agent submissions (typically 2).

        Returns:
            Verdict with the winner and reasoning.

        Raises:
            ValueError: If ``submissions`` is empty.
        """
        if not submissions:
            # Previously an empty list fell into the "< 2" branch and
            # crashed with IndexError on submissions[0].
            raise ValueError("LLMJudge.evaluate requires at least one submission")

        if len(submissions) < 2:
            # Only one submission - automatic winner.
            return Verdict(
                winner=submissions[0].agent,
                reasoning="Only one submission provided",
                scores={submissions[0].agent: 1.0},
                confidence=1.0,
            )

        # Build evaluation prompt from the first two submissions.
        sub_a = submissions[0]
        sub_b = submissions[1]

        user_prompt = f"""Task: {task}

Submission A ({sub_a.agent}):
{self._format_result(sub_a)}

Submission B ({sub_b.agent}):
{self._format_result(sub_b)}

Which submission better accomplishes the task? Evaluate based on: {", ".join(self.criteria)}"""

        # Call LLM with low temperature for deterministic judging.
        response = await self.llm.complete(
            messages=[
                LLMMessage(role="system", content=self.system_prompt),
                LLMMessage(role="user", content=user_prompt),
            ],
            temperature=0.1,
            json_mode=True,
        )

        # Parse the structured JSON verdict.
        try:
            data = json.loads(response.content)
            # Coerce to str: a non-string "winner" (e.g. JSON null) would
            # raise AttributeError on .upper(), which the except clause
            # below does not catch.
            winner_key = str(data.get("winner", "A"))

            if winner_key.upper() == "A":
                winner = sub_a.agent
            elif winner_key.upper() == "B":
                winner = sub_b.agent
            elif winner_key.lower() == "tie":
                winner = "tie"
            else:
                winner = sub_a.agent  # Default to A

            # Map the LLM's "A"/"B" scores back to agent names.
            scores = {}
            raw_scores = data.get("scores", {})
            if "A" in raw_scores:
                scores[sub_a.agent] = raw_scores["A"]
            if "B" in raw_scores:
                scores[sub_b.agent] = raw_scores["B"]

            return Verdict(
                winner=winner,
                reasoning=data.get("reasoning", ""),
                scores=scores,
                confidence=data.get("confidence", 0.8),
            )

        except (json.JSONDecodeError, KeyError, TypeError) as e:
            # Fallback: try to extract winner from free text.
            content = response.content.lower()
            if "submission b" in content or "agent b" in content:
                winner = sub_b.agent
            else:
                winner = sub_a.agent

            return Verdict(
                winner=winner,
                reasoning=response.content,
                confidence=0.5,
                metadata={"parse_error": str(e)},
            )

    def _format_result(self, submission: Submission) -> str:
        """Format a submission for the evaluation prompt."""
        result = submission.result

        # Handle TaskResult-like objects; fall back to the raw value.
        if hasattr(result, "to_dict"):
            result_data = result.to_dict()
        elif hasattr(result, "data"):
            result_data = result.data
        else:
            result_data = result

        parts = [f"Result: {result_data}"]

        # Truthiness check: latency/cost of 0 is omitted, matching
        # the original behavior.
        if submission.latency_ms:
            parts.append(f"Latency: {submission.latency_ms}ms")

        if submission.cost:
            parts.append(f"Cost: ${submission.cost:.4f}")

        return "\n".join(parts)

__init__(llm, criteria=None, system_prompt=None)

Initialize the LLM Judge.

Parameters:

Name Type Description Default
llm LLMProvider

LLM provider for evaluation.

required
criteria Optional[List[str]]

Evaluation criteria (default: accuracy, completeness, clarity, efficiency).

None
system_prompt Optional[str]

Custom system prompt for evaluation.

None
Source code in orc/judges/llm_judge.py
def __init__(
    self,
    llm: LLMProvider,
    criteria: Optional[List[str]] = None,
    system_prompt: Optional[str] = None,
):
    """
    Configure the judge's LLM, criteria, and system prompt.

    Args:
        llm: LLM provider used to run the evaluation.
        criteria: Evaluation criteria; falsy values select the default
            ["accuracy", "completeness", "clarity", "efficiency"].
        system_prompt: Custom system prompt; falsy values select the
            generated default prompt.
    """
    self.llm = llm
    if criteria:
        self.criteria = criteria
    else:
        self.criteria = ["accuracy", "completeness", "clarity", "efficiency"]
    # Lazily build the default prompt only when no override is given,
    # matching the original short-circuit `or` behavior.
    self.system_prompt = system_prompt if system_prompt else self._default_system_prompt()

evaluate(task, submissions) async

Evaluate submissions and determine a winner.

Parameters:

Name Type Description Default
task str

The original task description.

required
submissions List[Submission]

List of agent submissions (typically 2).

required

Returns:

Type Description
Verdict

Verdict with the winner and reasoning.

Source code in orc/judges/llm_judge.py
    async def evaluate(
        self,
        task: str,
        submissions: List[Submission],
    ) -> Verdict:
        """
        Evaluate submissions and determine a winner.

        Args:
            task: The original task description.
            submissions: List of agent submissions (typically 2).

        Returns:
            Verdict with the winner and reasoning.
        """
        if len(submissions) < 2:
            # Only one submission - automatic winner
            return Verdict(
                winner=submissions[0].agent,
                reasoning="Only one submission provided",
                scores={submissions[0].agent: 1.0},
                confidence=1.0,
            )

        # Build evaluation prompt
        sub_a = submissions[0]
        sub_b = submissions[1]

        user_prompt = f"""Task: {task}

Submission A ({sub_a.agent}):
{self._format_result(sub_a)}

Submission B ({sub_b.agent}):
{self._format_result(sub_b)}

Which submission better accomplishes the task? Evaluate based on: {", ".join(self.criteria)}"""

        # Call LLM
        response = await self.llm.complete(
            messages=[
                LLMMessage(role="system", content=self.system_prompt),
                LLMMessage(role="user", content=user_prompt),
            ],
            temperature=0.1,
            json_mode=True,
        )

        # Parse response
        try:
            data = json.loads(response.content)
            winner_key = data.get("winner", "A")

            if winner_key.upper() == "A":
                winner = sub_a.agent
            elif winner_key.upper() == "B":
                winner = sub_b.agent
            elif winner_key.lower() == "tie":
                winner = "tie"
            else:
                winner = sub_a.agent  # Default to A

            # Map scores to agent names
            scores = {}
            raw_scores = data.get("scores", {})
            if "A" in raw_scores:
                scores[sub_a.agent] = raw_scores["A"]
            if "B" in raw_scores:
                scores[sub_b.agent] = raw_scores["B"]

            return Verdict(
                winner=winner,
                reasoning=data.get("reasoning", ""),
                scores=scores,
                confidence=data.get("confidence", 0.8),
            )

        except (json.JSONDecodeError, KeyError) as e:
            # Fallback: try to extract winner from text
            content = response.content.lower()
            if "submission b" in content or "agent b" in content:
                winner = sub_b.agent
            else:
                winner = sub_a.agent

            return Verdict(
                winner=winner,
                reasoning=response.content,
                confidence=0.5,
                metadata={"parse_error": str(e)},
            )

ConsensusJudge

orc.judges.consensus_judge.ConsensusJudge

Judge that aggregates votes from multiple sub-judges.

Useful for reducing bias and increasing reliability.

Example

judge = ConsensusJudge([
    LLMJudge(llm1),
    LLMJudge(llm2),
    MetricsJudge(),
])

verdict = await judge.evaluate(task, submissions)

Source code in orc/judges/consensus_judge.py
class ConsensusJudge:
    """
    Judge that aggregates votes from multiple sub-judges.

    Useful for reducing bias and increasing reliability.

    Example:
        judge = ConsensusJudge([
            LLMJudge(llm1),
            LLMJudge(llm2),
            MetricsJudge(),
        ])

        verdict = await judge.evaluate(task, submissions)
    """

    def __init__(
        self,
        judges: List[Judge],
        require_majority: bool = True,
        tiebreaker: str = "first",  # "first", "random", or judge name
    ):
        """
        Initialize the Consensus Judge.

        Args:
            judges: List of judges to vote.
            require_majority: If True, winner needs >50% votes.
            tiebreaker: How to break ties ("first" judge, "random", or a
                specific judge name matched against a judge's ``name``
                attribute).
        """
        self.judges = judges
        self.require_majority = require_majority
        self.tiebreaker = tiebreaker

    async def evaluate(
        self,
        task: str,
        submissions: List[Submission],
    ) -> Verdict:
        """
        Evaluate by collecting votes from all judges.

        Args:
            task: The original task description.
            submissions: List of agent submissions.

        Returns:
            Verdict with the consensus winner.
        """
        # Collect verdicts from all judges in parallel.
        verdicts = await asyncio.gather(*[
            judge.evaluate(task, submissions)
            for judge in self.judges
        ])

        # Tally one vote per judge.
        votes: Dict[str, int] = {}
        vote_details: List[Dict] = []

        for i, verdict in enumerate(verdicts):
            winner = verdict.winner
            votes[winner] = votes.get(winner, 0) + 1
            vote_details.append({
                "judge_index": i,
                "winner": winner,
                "confidence": verdict.confidence,
                "reasoning": verdict.reasoning[:200],  # Truncate
            })

        # Determine winner.
        total_votes = len(verdicts)
        max_votes = max(votes.values()) if votes else 0
        winners = [agent for agent, count in votes.items() if count == max_votes]

        if not winners:
            # No verdicts at all (empty judge list): nothing to decide.
            # Previously this crashed with IndexError (winners[0]) when
            # require_majority was False.
            winner = "tie"
            confidence = 0.0
        elif len(winners) == 1:
            winner = winners[0]
            confidence = max_votes / total_votes
        elif self.require_majority and max_votes <= total_votes / 2:
            # No majority - tie.
            winner = "tie"
            confidence = 0.5
        else:
            # Tiebreaker among equally-voted winners.
            if self.tiebreaker == "first":
                # Use first judge's decision among tied winners.
                for verdict in verdicts:
                    if verdict.winner in winners:
                        winner = verdict.winner
                        break
                else:
                    winner = winners[0]
            elif self.tiebreaker == "random":
                import random
                winner = random.choice(winners)
            else:
                # Treat tiebreaker as a judge name; previously this path was
                # documented but always fell back to winners[0].
                # NOTE(review): assumes a judge may expose a `name` attribute
                # matching self.tiebreaker — falls back to winners[0]
                # (the prior behavior) when no judge matches.
                winner = winners[0]
                for judge, verdict in zip(self.judges, verdicts):
                    if getattr(judge, "name", None) == self.tiebreaker and verdict.winner in winners:
                        winner = verdict.winner
                        break

            confidence = max_votes / total_votes

        # Aggregate scores: mean of each agent's score across verdicts.
        aggregated_scores: Dict[str, float] = {}
        for verdict in verdicts:
            for agent, score in verdict.scores.items():
                if agent not in aggregated_scores:
                    aggregated_scores[agent] = 0.0
                aggregated_scores[agent] += score / len(verdicts)

        # Build human-readable reasoning.
        reasoning_lines = [
            f"Consensus decision: {winner}",
            f"Vote distribution: {votes}",
            "",
            "Individual verdicts:",
        ]
        for detail in vote_details:
            reasoning_lines.append(
                f"  Judge {detail['judge_index']}: {detail['winner']} "
                f"(confidence: {detail['confidence']:.2f})"
            )

        return Verdict(
            winner=winner,
            reasoning="\n".join(reasoning_lines),
            scores=aggregated_scores,
            confidence=confidence,
            metadata={
                "votes": votes,
                "vote_details": vote_details,
                "num_judges": len(self.judges),
            },
        )

__init__(judges, require_majority=True, tiebreaker='first')

Initialize the Consensus Judge.

Parameters:

Name Type Description Default
judges List[Judge]

List of judges to vote.

required
require_majority bool

If True, winner needs >50% votes.

True
tiebreaker str

How to break ties ("first" judge, "random", or specific judge name).

'first'
Source code in orc/judges/consensus_judge.py
def __init__(
    self,
    judges: List[Judge],
    require_majority: bool = True,
    tiebreaker: str = "first",  # "first", "random", or judge name
):
    """
    Configure the consensus voting panel.

    Args:
        judges: Sub-judges whose verdicts are tallied.
        require_majority: When True, a winner needs more than half the votes.
        tiebreaker: Tie-breaking strategy — "first" judge, "random", or a
            specific judge name.
    """
    self.tiebreaker = tiebreaker
    self.require_majority = require_majority
    self.judges = judges

evaluate(task, submissions) async

Evaluate by collecting votes from all judges.

Parameters:

Name Type Description Default
task str

The original task description.

required
submissions List[Submission]

List of agent submissions.

required

Returns:

Type Description
Verdict

Verdict with the consensus winner.

Source code in orc/judges/consensus_judge.py
async def evaluate(
    self,
    task: str,
    submissions: List[Submission],
) -> Verdict:
    """
    Evaluate by collecting votes from all judges.

    Args:
        task: The original task description.
        submissions: List of agent submissions.

    Returns:
        Verdict with the consensus winner.
    """
    # Collect verdicts from all judges in parallel.
    verdicts = await asyncio.gather(*[
        judge.evaluate(task, submissions)
        for judge in self.judges
    ])

    # Tally one vote per judge.
    votes: Dict[str, int] = {}
    vote_details: List[Dict] = []

    for i, verdict in enumerate(verdicts):
        winner = verdict.winner
        votes[winner] = votes.get(winner, 0) + 1
        vote_details.append({
            "judge_index": i,
            "winner": winner,
            "confidence": verdict.confidence,
            "reasoning": verdict.reasoning[:200],  # Truncate
        })

    # Determine winner.
    total_votes = len(verdicts)
    max_votes = max(votes.values()) if votes else 0
    winners = [agent for agent, count in votes.items() if count == max_votes]

    if not winners:
        # No verdicts at all (empty judge list): nothing to decide.
        # Previously this crashed with IndexError (winners[0]) when
        # require_majority was False.
        winner = "tie"
        confidence = 0.0
    elif len(winners) == 1:
        winner = winners[0]
        confidence = max_votes / total_votes
    elif self.require_majority and max_votes <= total_votes / 2:
        # No majority - tie.
        winner = "tie"
        confidence = 0.5
    else:
        # Tiebreaker among equally-voted winners.
        if self.tiebreaker == "first":
            # Use first judge's decision among tied winners.
            for verdict in verdicts:
                if verdict.winner in winners:
                    winner = verdict.winner
                    break
            else:
                winner = winners[0]
        elif self.tiebreaker == "random":
            import random
            winner = random.choice(winners)
        else:
            # Treat tiebreaker as a judge name; previously this path was
            # documented but always fell back to winners[0].
            # NOTE(review): assumes a judge may expose a `name` attribute
            # matching self.tiebreaker — falls back to winners[0]
            # (the prior behavior) when no judge matches.
            winner = winners[0]
            for judge, verdict in zip(self.judges, verdicts):
                if getattr(judge, "name", None) == self.tiebreaker and verdict.winner in winners:
                    winner = verdict.winner
                    break

        confidence = max_votes / total_votes

    # Aggregate scores: mean of each agent's score across verdicts.
    aggregated_scores: Dict[str, float] = {}
    for verdict in verdicts:
        for agent, score in verdict.scores.items():
            if agent not in aggregated_scores:
                aggregated_scores[agent] = 0.0
            aggregated_scores[agent] += score / len(verdicts)

    # Build human-readable reasoning.
    reasoning_lines = [
        f"Consensus decision: {winner}",
        f"Vote distribution: {votes}",
        "",
        "Individual verdicts:",
    ]
    for detail in vote_details:
        reasoning_lines.append(
            f"  Judge {detail['judge_index']}: {detail['winner']} "
            f"(confidence: {detail['confidence']:.2f})"
        )

    return Verdict(
        winner=winner,
        reasoning="\n".join(reasoning_lines),
        scores=aggregated_scores,
        confidence=confidence,
        metadata={
            "votes": votes,
            "vote_details": vote_details,
            "num_judges": len(self.judges),
        },
    )