Braintrust
Braintrust is an enterprise-grade stack for building AI products, including evaluations, a prompt playground, dataset management, and tracing.
Braintrust provides TypeScript and Python libraries to run and log evaluations, and it integrates well with Chroma.
Example evaluation script in Python (refer to the tutorial above for the full implementation):
from braintrust import Eval
from autoevals import Levenshtein
from openai import OpenAI

PROJECT_NAME = "Chroma_Eval"

client = OpenAI()

# String-distance scorer that compares the model's output to the expected answer
leven_evaluator = Levenshtein()
async def pipeline_a(input, hooks=None):
    # Retrieve the most relevant fact from Chroma
    # (`collection` is the Chroma collection built in the tutorial above)
    relevant = collection.query(
        query_texts=[input],
        n_results=1,
    )
    relevant_text = ",".join(relevant["documents"][0])
    prompt = """
        You are an assistant called BT. Help the user.
        Relevant information: {relevant}
        Question: {question}
        Answer:
        """.format(question=input, relevant=relevant_text)
    messages = [{"role": "system", "content": prompt}]
    response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=messages,
        temperature=0,
        max_tokens=100,
    )
    result = response.choices[0].message.content
    return result
# Run the evaluation and log the results to Braintrust.
# `await` works in a notebook; in a plain script, call Eval(...) without it.
await Eval(
    PROJECT_NAME,
    # define your test cases
    data=lambda: [{"input": "What is my eye color?", "expected": "Brown"}],
    # use the retrieval pipeline w/ Chroma defined above
    task=pipeline_a,
    # use a prebuilt scoring function or define your own :)
    scores=[leven_evaluator],
)
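The script assumes a Chroma collection has already been created and populated (see the tutorial above). For a self-contained run, a minimal setup sketch might look like this; the collection name and the seeded fact are illustrative placeholders, not part of the Braintrust integration itself:

import chromadb

# Hypothetical setup: an in-memory Chroma client and a small collection
# seeded with the fact the test case expects the pipeline to retrieve.
chroma_client = chromadb.Client()
collection = chroma_client.get_or_create_collection(name="braintrust_eval_facts")
collection.add(
    ids=["fact-1"],
    documents=["The user's eye color is brown."],
)

With the collection in place, running the evaluation logs each test case, the pipeline's output, and its Levenshtein score to the Chroma_Eval project in Braintrust.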
Learn more in the Braintrust docs.