import pandas as pd
import random
import json
import os
import openai
from openai import OpenAI
import time
import logging
import httpx
from datetime import datetime
# Read logging level from environment variable
logging_level = os.getenv('LOGGING_LEVEL', 'WARNING').upper()
# Configure logging with the level from the environment variable
logging.basicConfig(
level=getattr(logging, logging_level, logging.WARNING), # Default to WARNING if invalid level
format='%(asctime)s - %(levelname)s - %(message)s'
)
# Create a logger object
logger = logging.getLogger(__name__)
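# Example (illustrative invocation; the script name is hypothetical):
#   LOGGING_LEVEL=DEBUG python run_evaluation.py
# surfaces the logger.debug output emitted throughout this module.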
class LLMEvaluation:
"""
A class for evaluating language model responses.
This class sets up an evaluation pipeline for language models such as GPT-3.5 Turbo. It handles both the generation and the scoring of model responses, with options for customizing system prompts and configuring request timeouts.
Parameters
----------
model_name : str, optional
The name of the model to be used for generating responses. Default is "gpt-3.5-turbo".
openai_key : str, optional
The API key for authenticating requests to OpenAI. If not provided, it will attempt to use the key from environment variables.
generator_system_prompt : str, optional
Custom system prompt for generating responses. If not provided, a default prompt is used.
eval_system_prompt : str, optional
Custom system prompt for evaluation. If not provided, a default evaluation prompt is used.
enable_timeouts : bool, optional
Flag to enable or disable timeouts for API requests. Default is False.
timeouts_options : dict, optional
A dictionary specifying timeout options. Relevant only if `enable_timeouts` is True.
Raises
------
ValueError
If the OpenAI API key is invalid or not provided.
Attributes
----------
client : OpenAI
The OpenAI client configured for interaction with the model.
Examples
--------
>>> evaluator = LLMEvaluation(model_name="gpt-3.5-turbo", openai_key="your-api-key")
>>> evaluator.generator_system_prompt
"You are an intelligent assistant capable of generating responses based on prompts."
Notes
-----
The `openai_key` is essential for accessing OpenAI's API. Ensure the key is valid and has appropriate permissions.
"""
def __init__(self, model_name="gpt-3.5-turbo", openai_key=None, generator_system_prompt=None, eval_system_prompt=None, enable_timeouts=False, timeouts_options=None):
self.model_name = model_name
self.openai_api_key = openai_key if openai_key is not None else os.getenv("OPENAI_API_KEY")
if not self.openai_api_key or not self.openai_api_key.startswith("sk-"):
raise ValueError("Invalid or missing OpenAI API key. Provide openai_key or set the OPENAI_API_KEY environment variable.")
self.client = OpenAI(api_key=self.openai_api_key, max_retries=3)
if enable_timeouts:
if timeouts_options is None:
timeouts_options = {"total": 120.0, "read": 60.0, "write": 60.0, "connect": 10.0}
# httpx.Timeout takes the overall timeout positionally; read/write/connect must be keyword arguments
self.client = self.client.with_options(timeout=httpx.Timeout(timeouts_options["total"], read=timeouts_options["read"], write=timeouts_options["write"], connect=timeouts_options["connect"]))
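# Example custom timeouts (illustrative values only):
#   LLMEvaluation(enable_timeouts=True,
#                 timeouts_options={"total": 300.0, "read": 120.0, "write": 120.0, "connect": 10.0})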
if eval_system_prompt is None:
self.eval_system_prompt = """Your task is to compare a response generated by a model to the correct response.
Rate the generated response on a scale of 0.0 to 5.0, with 5.0 being perfectly correct and 0.0 being completely off.
The output must be valid JSON on a single line, containing the score and a short justification.
The score_reason must not exceed 12 words and must not include any special characters.
Eg: {
\"score\": 4.25,
\"score_reason\": \"12 words reasons without special chars\"
}"""
else:
self.eval_system_prompt = eval_system_prompt
if generator_system_prompt is None:
self.generator_system_prompt = """You are an intelligent assistant capable of generating responses based on prompts."""
else:
self.generator_system_prompt = generator_system_prompt
def save_dict_list_to_csv(self, data, output_file_path=None, output_folder='csv'):
"""
Converts a list of conversation data into a CSV file, categorizing data into columns for system prompts, user prompts, and assistant completions.
:method save_dict_list_to_csv: Process and save conversation data in a structured CSV format.
:type save_dict_list_to_csv: method
:param data: A list of dictionaries, each representing a conversation with messages categorized by roles ('system', 'user', 'assistant') and their respective content.
:type data: list
:param output_file_path: The file path for the output CSV file. Defaults to None, which uses a default filename.
:type output_file_path: str, optional
:param output_folder: The directory to save the output CSV file. Defaults to 'csv'.
:type output_folder: str, optional
:return: None. This method does not return anything but saves the processed data to a CSV file.
:rtype: None
:raises Exception: If any error occurs during the processing or file writing.
:example:
::
>>> data = [{'messages': [{'role': 'system', 'content': 'System message'}, {'role': 'user', 'content': 'User question'}, {'role': 'assistant', 'content': 'Assistant answer'}]}]
>>> save_dict_list_to_csv(data, output_file_path='output.csv')
# This will process the provided data and save it as 'output.csv' in the specified output folder.
:notes:
- The input data should be formatted correctly, with each conversation's messages having designated roles ('system', 'user', 'assistant').
"""
# Check if output folder exists, create if not
if not os.path.exists(output_folder):
os.makedirs(output_folder)
# Fall back to a default filename inside output_folder when no path is given
if output_file_path is None:
output_file_path = os.path.join(output_folder, 'output.csv')
try:
# Process data to extract prompts and completions
processed_data = []
for conversation in data:
system_prompt = ""
user_prompt = ""
assistant_completion = ""
for message in conversation['messages']:
if message['role'] == 'system':
system_prompt = message['content']
elif message['role'] == 'user':
user_prompt = message['content']
elif message['role'] == 'assistant':
assistant_completion = message['content']
processed_data.append({
'system_prompt': system_prompt,
'prompt': user_prompt,
'completion': assistant_completion
})
# Create DataFrame and Save to CSV
df = pd.DataFrame(processed_data)
df.to_csv(output_file_path, index=False)
except Exception as e:
raise Exception(f"An error occurred during CSV conversion: {e}")
def read_jsonl(self, file_path):
"""
Reads a JSONL (JSON Lines) file and returns the data as a list of dictionaries.
This method is designed to read and parse data from a JSONL file, where each line of the file is a separate JSON object.
It is particularly useful for processing datasets stored in the JSONL format, commonly used in data processing and machine learning tasks.
:param file_path: The path to the JSONL file to be read.
:type file_path: str
:return: A list of dictionaries, each representing a JSON object from a line in the JSONL file.
:rtype: List[dict]
:raises FileNotFoundError: If the specified file does not exist.
:raises json.JSONDecodeError: If any line in the file is not a valid JSON object.
:example:
::
>>> file_path = 'data.jsonl'
>>> data = read_jsonl(file_path)
>>> print(data[0]) # Display the first JSON object from the list.
"""
with open(file_path, 'r') as file:
return [json.loads(line) for line in file]
def save_random_prompts(self, input_file, output_file=None, output_format='csv', n_samples=100, output_folder='output'):
"""
Selects random prompts from a given file and saves them in the specified format.
:method save_random_prompts: Extract random samples from a data file and save them in a specified format.
:type save_random_prompts: method
:param input_file: The path to the input file, which can be in CSV, JSON, or JSONL format.
:type input_file: str
:param output_file: The base name of the output file without extension. If None, a name with a timestamp and the number of samples will be generated. Defaults to None.
:type output_file: str, optional
:param output_format: The format for the output file, which can be 'csv', 'json', or 'jsonl'. Defaults to 'csv'.
:type output_format: str, optional
:param n_samples: The number of random samples to select from the input file. Defaults to 100.
:type n_samples: int, optional
:param output_folder: The folder where the output file should be saved. Defaults to 'output'.
:type output_folder: str, optional
:return: None. This method does not return anything but saves the extracted samples to a file.
:rtype: None
:raises ValueError: If the input file format is unsupported or if the output format is not one of 'csv', 'json', or 'jsonl'.
:raises Exception: If any other error occurs during the processing or file writing.
:example:
::
>>> save_random_prompts("data.csv", output_file="sample_prompts", output_format='csv', n_samples=50, output_folder='output')
# This will select 50 random prompts from 'data.csv' and save them as 'sample_prompts_[timestamp]_50.csv' in the 'output' folder.
:notes:
- Ensure that the input file is in one of the supported formats (CSV, JSON, or JSONL) for correct processing.
"""
try:
# Check if output folder exists, create if not
if not os.path.exists(output_folder):
os.makedirs(output_folder)
# Read data
if input_file.endswith('.csv'):
data = pd.read_csv(input_file)
data_dict = data.to_dict(orient='records')
elif input_file.endswith('.json'):
data_dict = pd.read_json(input_file, orient='records').to_dict(orient='records')
elif input_file.endswith('.jsonl'):
data_dict = self.read_jsonl(input_file)
else:
raise ValueError("Unsupported file format. Please provide a CSV, JSON, or JSONL file.")
# Select random samples
random_samples = random.sample(data_dict, min(n_samples, len(data_dict)))
if output_file is None:
output_file = "random_prompts"
else:
output_file, _ = os.path.splitext(output_file)
# Construct output file path with timestamp
current_time = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
output_file_name = f"{output_file}_{current_time}_{n_samples}.{output_format}"
output_file_path = os.path.join(output_folder, output_file_name)
# Save data in the desired format
if output_format == 'csv':
if input_file.endswith('.jsonl') or input_file.endswith('.json'):
self.save_dict_list_to_csv(random_samples, output_file_path)
else:
df = pd.DataFrame(random_samples) # This creates a DataFrame with columns based on the keys in the dictionaries
df.to_csv(output_file_path, index=False) # Saves the DataFrame to a CSV file without the index column
elif output_format == 'json':
with open(output_file_path, 'w') as file:
json.dump(random_samples, file, indent=4)
elif output_format == 'jsonl':
with open(output_file_path, 'w') as file:
for item in random_samples:
file.write(json.dumps(item) + '\n')
else:
raise ValueError("[save_random_prompts] Unsupported output format. Please choose 'csv', 'json', or 'jsonl'.")
except Exception as e:
raise Exception(f"[save_random_prompts] Error: {e}")
def rephrase_and_optionally_classify(self, prompt,
model_name="gpt-4",
classify=False, classes=None,
prompt_style='student-asking',
temperature=1, max_tokens=256,
top_p=1,
frequency_penalty=0,
presence_penalty=0):
"""
Rephrases a given prompt and optionally classifies it using a specified language model.
This method takes a sentence and rephrases it using the specified language model. It can also classify
the rephrased sentence into provided categories, if classification is requested.
:param prompt: The original sentence that needs to be rephrased.
:type prompt: str
:param model_name: The name of the language model to use, defaults to 'gpt-4'.
:type model_name: str, optional
:param classify: Indicates whether to classify the rephrased sentence, defaults to False.
:type classify: bool, optional
:param classes: The list of classification categories, used if classify is True, defaults to None.
:type classes: list of str, optional
:param prompt_style: The style for rephrasing the prompt, used if classify is False, defaults to 'student-asking'.
:type prompt_style: str, optional
:param temperature: Controls the randomness of the output, defaults to 1.
:type temperature: float, optional
:param max_tokens: The maximum number of tokens to generate, defaults to 256.
:type max_tokens: int, optional
:param top_p: Nucleus sampling parameter, defaults to 1.
:type top_p: float, optional
:param frequency_penalty: Adjusts frequency of token usage, defaults to 0.
:type frequency_penalty: float, optional
:param presence_penalty: Adjusts presence of tokens, defaults to 0.
:type presence_penalty: float, optional
:return: A tuple containing the rephrased prompt and its classification (or None if not classified).
:rtype: tuple
:raises ValueError: If unable to parse the model response as JSON.
:raises Exception: If an error occurs during the API request or processing.
:example:
::
>>> prompt = "What is AI?"
>>> rephrased, classification = rephrase_and_optionally_classify(prompt, classify=True, classes=["ACADEMIC", "RESEARCH"])
>>> print(f"Rephrased: {rephrased}, Classification: {classification}")
# Outputs the rephrased sentence and its classification.
"""
try:
# Default classes if not provided
if classes is None:
classes = ["ACADEMIC", "RESEARCH", "ADMIN", "SCIENCE", "OTHERS", "BUSINESS", "TECHNOLOGY", "HEALTH", "ENTERTAINMENT", "SPORTS"]
classes_str = ", ".join(classes)
if classify:
system_message = f"You are a helpful assistant to rephrase and classify sentences as {classes_str}."
user_message = f"Please rephrase: '{prompt}' and classify it into one of the categories: {classes_str}. Output format: '{{\"rephrased_prompt\" : \"value\", \"classification\" : \"value\"}}'"
else:
system_message = f"You are a helpful assistant that helps rephrase sentences in a '{prompt_style}' style."
user_message = f"Please rephrase the following sentence: '{prompt}'. Output format: '{{\"rephrased_prompt\" : \"value\"}}'"
response = self.client.chat.completions.create(
model=model_name,
messages=[
{"role": "system", "content": system_message},
{"role": "user", "content": user_message}
],
temperature=temperature,
max_tokens=max_tokens,
top_p=top_p,
frequency_penalty=frequency_penalty,
presence_penalty=presence_penalty,
stop=["\n"]
)
model_response = response.choices[0].message.content
logger.debug("Model response: %s", model_response)
if classify:
response_data = json.loads(model_response)
return response_data["rephrased_prompt"], response_data["classification"]
else:
logger.debug(model_response)
response_data = json.loads(model_response)
logger.debug(response_data)
return response_data["rephrased_prompt"], None
except json.JSONDecodeError:
raise ValueError("Failed to parse model response as JSON.")
except Exception as e:
raise Exception(f"An unexpected error occurred: {e}")
def rephrase_and_classify_prompts_in_dataset(self, input_csv, output_csv, model_name="gpt-3.5-turbo", classify=False, classes=None):
"""
Processes and classifies prompts from an input CSV file and saves the results to an output CSV file.
:method rephrase_and_classify_prompts_in_dataset: Process prompts from an input CSV file, potentially classify them, and save the results in another CSV file.
:type rephrase_and_classify_prompts_in_dataset: method
:param input_csv: The path to the input CSV file containing prompts and their corresponding completions.
:type input_csv: str
:param output_csv: The path to the output CSV file where the processed data will be saved.
:type output_csv: str
:param model_name: The name of the language model to use for rephrasing prompts. Default is 'gpt-3.5-turbo'.
:type model_name: str, optional
:param classify: Flag indicating whether to classify the rephrased prompts. Default is False.
:type classify: bool, optional
:param classes: The list of classification categories to be used if classification is enabled. Default is None.
:type classes: list of str, optional
:return: None. The method does not return anything but saves the processed data to the specified output CSV file.
:rtype: None
:raises FileNotFoundError: If the input CSV file is not found.
:raises Exception: For any other exceptions that may occur during the processing or file writing.
:example:
::
>>> rephrase_and_classify_prompts_in_dataset("input_prompts.csv", "processed_prompts.csv", classify=True, classes=["class1", "class2"])
# This will read prompts from 'input_prompts.csv', process and optionally classify them, and save the results to 'processed_prompts.csv'.
:notes:
- The method expects the input CSV to have columns named 'prompt' and 'completion'.
- Classification is optional and is performed only if the 'classify' parameter is set to True.
"""
# Create the output folder if it doesn't exist (skip when output_csv has no directory component)
output_folder = os.path.dirname(output_csv)
if output_folder and not os.path.exists(output_folder):
os.makedirs(output_folder)
# Load input data
try:
df_input = pd.read_csv(input_csv)
except FileNotFoundError as e:
logger.error(f"The file {input_csv} was not found: {e}")
return
except Exception as e:
logger.error(f"Error reading input CSV file: {e}")
return
# Creating a DataFrame for the output
df_output = pd.DataFrame(columns=['initial_prompt', 'prompt', 'ground_truth_completion', 'classification'])
for index, row in df_input.iterrows():
try:
initial_prompt = row['prompt']
completion = row['completion']
# Use the rephrase_and_optionally_classify method
rephrased_prompt, classification = self.rephrase_and_optionally_classify(initial_prompt, model_name=model_name, classify=classify, classes=classes)
#logger.debug(f"rephrased_prompt: {rephrased_prompt}")
#logger.debug(f"classification: {classification}")
new_row = {
"initial_prompt": initial_prompt,
"prompt": rephrased_prompt,
"ground_truth_completion": completion,
"classification": str(classification)
}
# Using pd.concat instead of append
df_output = pd.concat([df_output, pd.DataFrame([new_row])], ignore_index=True)
# To avoid hitting rate limits
time.sleep(0.5)
except Exception as e:
logger.error(f"Error processing row {index}: {e}")
# Save to CSV
try:
df_output.to_csv(output_csv, index=False)
except Exception as e:
logger.error(f"Error writing to output CSV file: {e}")
def get_generated_completion(self,
finetuned_model,
prompt,
temperature=1.0,
max_tokens=256,
top_p=1.0,
finetuned_model_start_sequence_only_for_base_models=""):
"""
Retrieves the generated completion from a specified model based on the given prompt.
This method interacts with the OpenAI API to generate a completion based on the provided prompt and model parameters.
It is designed to work with both base and fine-tuned models, offering various customization options for the generation process.
:param finetuned_model: The name of the fine-tuned or base model to use for generating the completion.
:type finetuned_model: str
:param prompt: The input prompt to which the model generates a completion.
:type prompt: str
:param temperature: The sampling temperature, controlling the randomness of the output. Defaults to 1.0.
:type temperature: float, optional
:param max_tokens: The maximum number of tokens to generate in the response. Defaults to 256.
:type max_tokens: int, optional
:param top_p: The top-p sampling parameter, controlling the range of token probabilities considered for sampling. Defaults to 1.0.
:type top_p: float, optional
:param finetuned_model_start_sequence_only_for_base_models: A start sequence used only for base models, if applicable.
:type finetuned_model_start_sequence_only_for_base_models: str, optional
:return: The generated completion as a string.
:rtype: str
:raises ValueError: If an unknown model name is specified.
:raises Exception: If there is an error during the generation process or with the OpenAI API interaction.
:example:
::
>>> finetuned_model = 'gpt-3.5-turbo'
>>> prompt = 'Translate the following English text to French: Hello, how are you?'
>>> completion = get_generated_completion(finetuned_model, prompt)
>>> print(completion)
# Outputs the model-generated translation of the prompt.
"""
#logger.debug('eval_system_prompt', self.eval_system_prompt)
try:
if "gpt" in finetuned_model:
response = self.client.chat.completions.create(
model=finetuned_model,
messages=[
{"role": "system", "content": self.generator_system_prompt},
{"role": "user", "content": prompt}
],
temperature=temperature,
max_tokens=max_tokens,
top_p=top_p,
frequency_penalty=0,
presence_penalty=0,
stop=["END"]
)
return response.choices[0].message.content
else:
raise ValueError(f"Unknown model: {finetuned_model}")
except Exception as e:
raise Exception(f"Error in get_generated_completion: {e}")
def score_response(self, prompt, ground_truth_completion, generated_completion, llm_evaluator_model_name="gpt-3.5-turbo"):
"""
Generates a scoring prompt, sends it to a language model, and parses the response to evaluate a given generated completion against a ground truth.
:method score_response: Evaluate the generated completion of a prompt against the ground truth completion using a large language model.
:type score_response: method
:param prompt: The original prompt used in generating the completion.
:type prompt: str
:param ground_truth_completion: The correct or expected response to the prompt.
:type ground_truth_completion: str
:param generated_completion: The response generated by the evaluated model.
:type generated_completion: str
:param llm_evaluator_model_name: Name of the large language model to be used for scoring. Defaults to "gpt-3.5-turbo".
:type llm_evaluator_model_name: str, optional
:return: A tuple containing the score (numeric) and the reasoning behind the score (string).
:rtype: tuple
:raises ValueError: If the response from the model is not in a valid JSON format.
:raises Exception: For any other exceptions that may occur during API calls or processing.
:example:
::
>>> score, reason = score_response("What is AI?", "Artificial Intelligence", "AI is a field in computer science.", "gpt-3.5-turbo")
# This evaluates the generated completion for accuracy and relevance, returning a score and reasoning.
:notes:
- The method constructs an evaluation prompt combining the original prompt, ground truth completion, and the generated completion.
- It then sends this prompt to the specified language model for scoring and parses the model's response to extract the score and reasoning.
"""
#system_prompt = "You are an intelligent assistant capable of evaluating responses based on accuracy and relevance."
input_prompt = (
f"The correct response to the prompt '{prompt}' was: '{ground_truth_completion}'. "
f"The response generated by the test model was: '{generated_completion}'. "
)
try:
response = self.client.chat.completions.create(
model=llm_evaluator_model_name,
messages=[
{"role": "system", "content": self.eval_system_prompt},
{"role": "user", "content": input_prompt},
{"role": "assistant", "content": ""}
],
)
response_content = json.loads(response.choices[0].message.content)
score = response_content["score"]
score_reason = response_content["score_reason"]
return score, score_reason
except json.JSONDecodeError:
raise ValueError("Failed to parse model response as JSON.")
except Exception as e:
raise Exception(f"An unexpected error occurred: {e}")
def evaluate_model(self, finetuned_model, dataset_csv_path, results_csv_path,
temperature=1.0, max_tokens=150, top_p=1.0,
frequency_penalty=0, presence_penalty=0,
model_start_sequence="",
llm_evaluator_model_name="gpt-3.5-turbo", dataset_size=100,
finetuned_model_start_sequence_only_for_base_models="",
experiment_id=1,
save_immediately=False):
"""
Evaluates the performance of a specified model using a dataset and generates statistical insights.
This method assesses a model's ability to respond to prompts, comparing generated responses with expected completions.
It provides a systematic approach to evaluate model performance, saving the results for analysis.
:param finetuned_model: The name of the model to be evaluated.
:type finetuned_model: str
:param dataset_csv_path: Path to the CSV file containing prompts and expected completions for evaluation.
:type dataset_csv_path: str
:param results_csv_path: Path where the evaluation results will be saved.
:type results_csv_path: str
[Other parameters...]
:return: None. The evaluation results are saved to the specified CSV file.
:rtype: None
:raises FileNotFoundError: If the dataset CSV file is not found.
:raises ValueError: If the dataset CSV file is not properly formatted or missing required columns.
:raises Exception: For other exceptions that may occur during the evaluation process.
:example:
::
>>> finetuned_model = 'gpt-3.5-turbo'
>>> dataset_csv_path = 'path/to/dataset.csv'
>>> results_csv_path = 'path/to/results.csv'
>>> evaluate_model(finetuned_model, dataset_csv_path, results_csv_path)
# This will evaluate the model using the dataset and save results to the specified path.
"""
if not os.path.exists(dataset_csv_path):
raise FileNotFoundError(f"[evaluate_model] The input CSV file {dataset_csv_path} was not found.")
# Check if the file extension is .csv
if not dataset_csv_path.endswith('.csv'):
raise ValueError(f"[evaluate_model] The input file {dataset_csv_path} is not a CSV file. Make sure that input_csv of the dataset to evaluate is in a CSV Format.")
try:
df_input = pd.read_csv(dataset_csv_path)
# Required columns
required_columns = ['initial_prompt', 'prompt', 'ground_truth_completion', 'classification']
# Check if all required columns are present
if not all(column in df_input.columns for column in required_columns):
missing_columns = [column for column in required_columns if column not in df_input.columns]
raise ValueError(f"The CSV file is missing the following required columns: {', '.join(missing_columns)}")
all_frames = []
for index, row in df_input.iterrows():
prompt = row["prompt"]
ground_truth_completion = row["ground_truth_completion"]
logger.debug(prompt)
generated_completion = self.get_generated_completion(finetuned_model, prompt, temperature, max_tokens, top_p,
finetuned_model_start_sequence_only_for_base_models)
#logger.debug("*******> generated_completion: ", generated_completion)
# Score the generated completion against the ground truth
score, scoring_reason = self.score_response(prompt, ground_truth_completion, generated_completion)
# Get the current timestamp
current_time = datetime.now()
timestamp_str = current_time.strftime("%Y%m%d%H%M%S")
# Add the timestamp to the output_data dictionary
output_data = {
"Timestamp": timestamp_str,
"Prompt": prompt,
"Ground Truth": row["ground_truth_completion"],
"Generated Response": generated_completion,
"Temperature": temperature,
"Max Tokens": max_tokens,
"Top P": top_p,
"Frequency Penalty": frequency_penalty,
"Presence Penalty": presence_penalty,
"Score": score,
"Classification": row["classification"],
"Model Name": finetuned_model,
"Dataset Size": dataset_size,
"GPT Eval Model": llm_evaluator_model_name,
"Scoring Reason": scoring_reason,
"Experiment_ID": experiment_id
}
# Append the dictionary to the list
all_frames.append(output_data)
# Log progress for debugging
logger.debug("row[%s] Score: %s Prompt: %s %s", index, score, prompt[:50], finetuned_model)
logger.debug("ground_truth_completion: %s", ground_truth_completion[:100])
logger.debug("generated_completion: %s", generated_completion[:100])
logger.debug("scoring: %s", score)
logger.debug("scoring_reason: %s", scoring_reason)
# Save the data immediately if specified
if save_immediately:
# Check if the output CSV file already exists
file_exists = os.path.exists(results_csv_path)
# Append the new data to the CSV file
mode = 'a' if file_exists else 'w'
header = not file_exists # Write header only if file doesn't exist
pd.DataFrame([output_data]).to_csv(results_csv_path, mode=mode, header=header, index=False)
# To avoid hitting rate limits
time.sleep(1.0)
if not save_immediately:
# Create a DataFrame from the list of dictionaries
df = pd.DataFrame(all_frames)
# Save the DataFrame to a CSV file
df.to_csv(results_csv_path, index=False)
# Generate statistics and save them
#self.save_stats(final_df, psugpt_model)
except pd.errors.ParserError as e:
raise ValueError(f"The file {dataset_csv_path} could not be parsed as CSV: {e}")
except openai.APIConnectionError as e:
logger.error(f"[evaluate_model] APIConnectionError error:\n{e}")
except openai.RateLimitError as e:
# The client is configured with max_retries=3, so retries have already been attempted; log the remaining failure
logger.error(f"[evaluate_model] RateLimitError: {e}")
except openai.APIStatusError as e:
logger.error(f"[evaluate_model] APIStatusError:\n{e}")
except AttributeError as e:
logger.error(f"[evaluate_model] AttributeError:\n{e}")
except Exception as e:
raise Exception(f"An error occurred during model evaluation: {e}")
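if __name__ == "__main__":
    # Minimal end-to-end sketch (not part of the library API): the file names below are
    # hypothetical, and a valid OPENAI_API_KEY must be available in the environment.
    evaluator = LLMEvaluation(model_name="gpt-3.5-turbo")
    # 1. Sample prompts from a fine-tuning dataset (JSONL) into a CSV.
    #    Note: the saved file name also carries a timestamp and the sample count.
    evaluator.save_random_prompts("training_data.jsonl", output_file="sample_prompts",
                                  output_format="csv", n_samples=10, output_folder="output")
    # 2. Rephrase and classify the sampled prompts (path shown is illustrative;
    #    substitute the timestamped file produced in step 1).
    evaluator.rephrase_and_classify_prompts_in_dataset("output/sample_prompts.csv",
                                                       "output/rephrased_prompts.csv",
                                                       classify=True)
    # 3. Generate completions with the model under test and score them against the
    #    ground-truth completions.
    evaluator.evaluate_model("gpt-3.5-turbo", "output/rephrased_prompts.csv",
                             "output/evaluation_results.csv", dataset_size=10)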