Source code for scalexi.llm_evaluation.evaluate

import pandas as pd
import random
import json
import os
import openai
from openai import OpenAI
import time 
import logging
import httpx
from datetime import datetime
#
# Read logging level from environment variable
logging_level = os.getenv('LOGGING_LEVEL', 'WARNING').upper()

# Configure logging with the level from the environment variable
logging.basicConfig(
    level=getattr(logging, logging_level, logging.WARNING),  # Default to WARNING if invalid level
    format='%(asctime)s - %(levelname)s - %(message)s'
)

# Create a logger object
logger = logging.getLogger(__name__)

class LLMEvaluation:
    """
    A class for evaluating language model responses.

    This class initializes an evaluation system for language models, particularly designed for
    assessing responses generated by models like GPT-3.5 Turbo. It can handle generation and
    evaluation of model responses, with options for customizing system prompts and handling timeouts.

    Parameters
    ----------
    model_name : str, optional
        The name of the model to be used for generating responses. Default is "gpt-3.5-turbo".
    openai_key : str, optional
        The API key for authenticating requests to OpenAI. If not provided, it will attempt to use
        the key from environment variables.
    generator_system_prompt : str, optional
        Custom system prompt for generating responses. If not provided, a default prompt is used.
    eval_system_prompt : str, optional
        Custom system prompt for evaluation. If not provided, a default evaluation prompt is used.
    enable_timeouts : bool, optional
        Flag to enable or disable timeouts for API requests. Default is False.
    timeouts_options : dict, optional
        A dictionary specifying timeout options. Relevant only if `enable_timeouts` is True.

    Raises
    ------
    ValueError
        If the OpenAI API key is invalid or not provided.

    Attributes
    ----------
    client : OpenAI
        The OpenAI client configured for interaction with the model.

    Examples
    --------
    >>> evaluator = LLMEvaluation(model_name="gpt-3.5-turbo", openai_key="your-api-key")
    >>> evaluator.generator_system_prompt
    "You are an intelligent assistant capable of generating responses based on prompts."

    Notes
    -----
    The `openai_key` is essential for accessing OpenAI's API. Ensure the key is valid and has
    appropriate permissions.
    """

    def __init__(self, model_name="gpt-3.5-turbo", openai_key=None, generator_system_prompt=None,
                 eval_system_prompt=None, enable_timeouts=False, timeouts_options=None):
        self.model_name = model_name
        self.openai_api_key = openai_key if openai_key is not None else os.getenv("OPENAI_API_KEY")
        if not self.openai_api_key or not self.openai_api_key.startswith("sk-"):
            raise ValueError("Invalid OpenAI API key.")

        self.client = OpenAI(api_key=self.openai_api_key, max_retries=3)
        if enable_timeouts:
            if timeouts_options is None:
                timeouts_options = {"total": 120, "read": 60.0, "write": 60.0, "connect": 10.0}
                self.client = self.client.with_options(
                    timeout=httpx.Timeout(120.0, read=60.0, write=60.0, connect=10.0))
            else:
                self.client = self.client.with_options(
                    timeout=httpx.Timeout(timeouts_options["total"],
                                          read=timeouts_options["read"],
                                          write=timeouts_options["write"],
                                          connect=timeouts_options["connect"]))

        if eval_system_prompt is None:
            self.eval_system_prompt = (
                "Your task is to compare responses generated by a model to correct responses. "
                "These responses are compared to a correct response, and a single number (0.0-5.0) "
                "is generated for each model. Please rate the generated response on a scale of 0.0 to 5.0, "
                "with 5.0 being perfectly correct and 0.0 being completely off. "
                "The output should be a JSON with the score based on the prompt. Also justify your scoring in 12 words. "
                "The score_reason should not exceed 12 words and must not include any special characters. "
                "The output must be in VALID JSONL format in one line. "
                "Eg: { \"score\": 4.25, \"score_reason\": \"12 words reasons without special chars\" }"
            )
        else:
            self.eval_system_prompt = eval_system_prompt

        if generator_system_prompt is None:
            self.generator_system_prompt = (
                "You are an intelligent assistant capable of generating responses based on prompts."
            )
        else:
            self.generator_system_prompt = generator_system_prompt
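    # Illustrative note (not executed): when `enable_timeouts` is True, a custom
    # `timeouts_options` dict is expected to carry exactly the keys read above.
    # A hypothetical construction mirroring the defaults in __init__:
    #
    #     evaluator = LLMEvaluation(
    #         openai_key="sk-...",                      # placeholder key
    #         enable_timeouts=True,
    #         timeouts_options={"total": 120, "read": 60.0, "write": 60.0, "connect": 10.0},
    #     )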
    def save_dict_list_to_csv(self, data, output_file_path=None, output_folder='csv'):
        """
        Converts a list of conversation data into a CSV file, categorizing data into columns for
        system prompts, user prompts, and assistant completions.

        :method save_dict_list_to_csv: Process and save conversation data in a structured CSV format.
        :type save_dict_list_to_csv: method

        :param data: A list of dictionaries, each representing a conversation with messages
            categorized by roles ('system', 'user', 'assistant') and their respective content.
        :type data: list
        :param output_file_path: The file path for the output CSV file. Defaults to None, which uses a default filename.
        :type output_file_path: str, optional
        :param output_folder: The directory to save the output CSV file. Defaults to 'csv'.
        :type output_folder: str, optional

        :return: None. This method does not return anything but saves the processed data to a CSV file.
        :rtype: None

        :raises Exception: If any error occurs during the processing or file writing.

        :example:

        ::

            >>> data = [{'messages': [{'role': 'system', 'content': 'System message'},
            ...                       {'role': 'user', 'content': 'User question'},
            ...                       {'role': 'assistant', 'content': 'Assistant answer'}]}]
            >>> save_dict_list_to_csv(data, output_file_path='output.csv')
            # This will process the provided data and save it as 'output.csv' in the specified output folder.

        :notes:

        - The input data should be formatted correctly, with each conversation's messages having
          designated roles ('system', 'user', 'assistant').
        """
        # Check if output folder exists, create if not
        if not os.path.exists(output_folder):
            os.makedirs(output_folder)
        try:
            # Process data to extract prompts and completions
            processed_data = []
            for conversation in data:
                system_prompt = ""
                user_prompt = ""
                assistant_completion = ""
                for message in conversation['messages']:
                    if message['role'] == 'system':
                        system_prompt = message['content']
                    elif message['role'] == 'user':
                        user_prompt = message['content']
                    elif message['role'] == 'assistant':
                        assistant_completion = message['content']
                processed_data.append({
                    'system_prompt': system_prompt,
                    'prompt': user_prompt,
                    'completion': assistant_completion
                })

            # Create DataFrame and save to CSV
            df = pd.DataFrame(processed_data)
            df.to_csv(output_file_path, index=False)
        except Exception as e:
            raise Exception(f"An error occurred during CSV conversion: {e}")
    def read_jsonl(self, file_path):
        """
        Reads a JSONL (JSON Lines) file and returns the data as a list of dictionaries.

        This method is designed to read and parse data from a JSONL file, where each line of the
        file is a separate JSON object. It is particularly useful for processing datasets stored in
        the JSONL format, commonly used in data processing and machine learning tasks.

        :param file_path: The path to the JSONL file to be read.
        :type file_path: str

        :return: A list of dictionaries, each representing a JSON object from a line in the JSONL file.
        :rtype: List[dict]

        :raises FileNotFoundError: If the specified file does not exist.
        :raises json.JSONDecodeError: If any line in the file is not a valid JSON object.

        :example:

        ::

            >>> file_path = 'data.jsonl'
            >>> data = read_jsonl(file_path)
            >>> print(data[0])  # Display the first JSON object from the list.
        """
        with open(file_path, 'r') as file:
            return [json.loads(line) for line in file]
    def save_random_prompts(self, input_file, output_file=None, output_format='csv', n_samples=100, output_folder='output'):
        """
        Selects random prompts from a given file and saves them in the specified format.

        :method save_random_prompts: Extract random samples from a data file and save them in a specified format.
        :type save_random_prompts: method

        :param input_file: The path to the input file, which can be in CSV, JSON, or JSONL format.
        :type input_file: str
        :param output_file: The base name of the output file without extension. If None, a name with a timestamp and the number of samples will be generated. Defaults to None.
        :type output_file: str, optional
        :param output_format: The format for the output file, which can be 'csv', 'json', or 'jsonl'. Defaults to 'csv'.
        :type output_format: str, optional
        :param n_samples: The number of random samples to select from the input file. Defaults to 100.
        :type n_samples: int, optional
        :param output_folder: The folder where the output file should be saved. Defaults to 'output'.
        :type output_folder: str, optional

        :return: None. This method does not return anything but saves the extracted samples to a file.
        :rtype: None

        :raises ValueError: If the input file format is unsupported or if the output format is not one of 'csv', 'json', or 'jsonl'.
        :raises Exception: If any other error occurs during the processing or file writing.

        :example:

        ::

            >>> save_random_prompts("data.csv", output_file="sample_prompts", output_format='csv', n_samples=50, output_folder='output')
            # This will select 50 random prompts from 'data.csv' and save them as
            # 'sample_prompts_[timestamp]_50.csv' in the 'output' folder.

        :notes:

        - Ensure that the input file is in one of the supported formats (CSV, JSON, or JSONL) for correct processing.
        """
        try:
            # Check if output folder exists, create if not
            if not os.path.exists(output_folder):
                os.makedirs(output_folder)

            # Read data
            if input_file.endswith('.csv'):
                data = pd.read_csv(input_file)
                data_dict = data.to_dict(orient='records')
            elif input_file.endswith('.json'):
                data_dict = pd.read_json(input_file, orient='records').to_dict(orient='records')
            elif input_file.endswith('.jsonl'):
                data_dict = self.read_jsonl(input_file)
            else:
                raise ValueError("Unsupported file format. Please provide a CSV, JSON, or JSONL file.")

            # Select random samples
            random_samples = random.sample(data_dict, min(n_samples, len(data_dict)))

            if output_file is None:
                output_file = "random_prompts"
            else:
                output_file, _ = os.path.splitext(output_file)

            # Construct output file path with timestamp
            current_time = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
            output_file_name = f"{output_file}_{current_time}_{n_samples}.{output_format}"
            output_file_path = os.path.join(output_folder, output_file_name)

            # Save data in the desired format
            if output_format == 'csv':
                if input_file.endswith('.jsonl') or input_file.endswith('.json'):
                    self.save_dict_list_to_csv(random_samples, output_file_path)
                else:
                    # Create a DataFrame with columns based on the keys in the dictionaries
                    df = pd.DataFrame(random_samples)
                    # Save the DataFrame to a CSV file without the index column
                    df.to_csv(output_file_path, index=False)
            elif output_format == 'json':
                with open(output_file_path, 'w') as file:
                    json.dump(random_samples, file, indent=4)
            elif output_format == 'jsonl':
                with open(output_file_path, 'w') as file:
                    for item in random_samples:
                        file.write(json.dumps(item) + '\n')
            else:
                raise ValueError("[save_random_prompts] Unsupported output format. Please choose 'csv', 'json', or 'jsonl'.")
        except Exception as e:
            raise Exception(f"[save_random_prompts] Error: {e}")
    def rephrase_and_optionally_classify(self, prompt, model_name="gpt-4", classify=False, classes=None,
                                         prompt_style='student-asking', temperature=1, max_tokens=256,
                                         top_p=1, frequency_penalty=0, presence_penalty=0):
        """
        Rephrases a given prompt and optionally classifies it using a specified language model.

        This method takes a sentence and rephrases it using the specified language model. It can
        also classify the rephrased sentence into provided categories, if classification is requested.

        :param prompt: The original sentence that needs to be rephrased.
        :type prompt: str
        :param model_name: The name of the language model to use, defaults to 'gpt-4'.
        :type model_name: str, optional
        :param classify: Indicates whether to classify the rephrased sentence, defaults to False.
        :type classify: bool, optional
        :param classes: The list of classification categories, used if classify is True, defaults to None.
        :type classes: list of str, optional
        :param prompt_style: The style for rephrasing the prompt, used if classify is False, defaults to 'student-asking'.
        :type prompt_style: str, optional
        :param temperature: Controls the randomness of the output, defaults to 1.
        :type temperature: float, optional
        :param max_tokens: The maximum number of tokens to generate, defaults to 256.
        :type max_tokens: int, optional
        :param top_p: Nucleus sampling parameter, defaults to 1.
        :type top_p: float, optional
        :param frequency_penalty: Adjusts frequency of token usage, defaults to 0.
        :type frequency_penalty: float, optional
        :param presence_penalty: Adjusts presence of tokens, defaults to 0.
        :type presence_penalty: float, optional

        :return: A tuple containing the rephrased prompt and its classification (or None if not classified).
        :rtype: tuple

        :raises ValueError: If unable to parse the model response as JSON.
        :raises Exception: If an error occurs during the API request or processing.

        :example:

        ::

            >>> prompt = "What is AI?"
            >>> rephrased, classification = rephrase_and_optionally_classify(prompt, classify=True, classes=["ACADEMIC", "RESEARCH"])
            >>> print(f"Rephrased: {rephrased}, Classification: {classification}")
            # Outputs the rephrased sentence and its classification.
        """
        try:
            # Default classes if not provided
            if classes is None:
                classes = ["ACADEMIC", "RESEARCH", "ADMIN", "SCIENCE", "OTHERS", "BUSINESS",
                           "TECHNOLOGY", "HEALTH", "ENTERTAINMENT", "SPORTS"]
            classes_str = ", ".join(classes)

            if classify:
                system_message = f"You are a helpful assistant to rephrase and classify sentences as {classes_str}."
                user_message = (f"Please rephrase: '{prompt}' and classify it into one of the categories: {classes_str}. "
                                f"Output format: '{{\"rephrased_prompt\" : \"value\", \"classification\" : \"value\"}}'")
            else:
                system_message = f"You are a helpful assistant that helps rephrase sentences in a '{prompt_style}' style."
                user_message = (f"Please rephrase the following sentence: '{prompt}'. "
                                f"Output format: '{{\"rephrased_prompt\" : \"value\"}}'")

            response = self.client.chat.completions.create(
                model=model_name,
                messages=[
                    {"role": "system", "content": system_message},
                    {"role": "user", "content": user_message}
                ],
                temperature=temperature,
                max_tokens=max_tokens,
                top_p=top_p,
                frequency_penalty=frequency_penalty,
                presence_penalty=presence_penalty,
                stop=["\n"]
            )
            model_response = response.choices[0].message.content
            logger.debug(model_response)

            response_data = json.loads(model_response)
            logger.debug(response_data)
            if classify:
                return response_data["rephrased_prompt"], response_data["classification"]
            return response_data["rephrased_prompt"], None
        except json.JSONDecodeError:
            raise ValueError("Failed to parse model response as JSON.")
        except Exception as e:
            raise Exception(f"An unexpected error occurred: {e}")
    def rephrase_and_classify_prompts_in_dataset(self, input_csv, output_csv, model_name="gpt-3.5-turbo", classify=False, classes=None):
        """
        Processes and classifies prompts from an input CSV file and saves the results to an output CSV file.

        :method rephrase_and_classify_prompts_in_dataset: Process prompts from an input CSV file, potentially classify them, and save the results in another CSV file.
        :type rephrase_and_classify_prompts_in_dataset: method

        :param input_csv: The path to the input CSV file containing prompts and their corresponding completions.
        :type input_csv: str
        :param output_csv: The path to the output CSV file where the processed data will be saved.
        :type output_csv: str
        :param model_name: The name of the language model to use for rephrasing prompts. Default is 'gpt-3.5-turbo'.
        :type model_name: str, optional
        :param classify: Flag indicating whether to classify the rephrased prompts. Default is False.
        :type classify: bool, optional
        :param classes: The list of classification categories to be used if classification is enabled. Default is None.
        :type classes: list of str, optional

        :return: None. The method does not return anything but saves the processed data to the specified output CSV file.
        :rtype: None

        :raises FileNotFoundError: If the input CSV file is not found.
        :raises Exception: For any other exceptions that may occur during the processing or file writing.

        :example:

        ::

            >>> rephrase_and_classify_prompts_in_dataset("input_prompts.csv", "processed_prompts.csv", classify=True, classes=["class1", "class2"])
            # This will read prompts from 'input_prompts.csv', process and optionally classify them,
            # and save the results to 'processed_prompts.csv'.

        :notes:

        - The method expects the input CSV to have columns named 'prompt' and 'completion'.
        - Classification is optional and is performed only if the 'classify' parameter is set to True.
        """
        # Create the output folder if it doesn't exist
        output_folder = os.path.dirname(output_csv)
        if output_folder and not os.path.exists(output_folder):
            os.makedirs(output_folder)

        # Load input data
        try:
            df_input = pd.read_csv(input_csv)
        except FileNotFoundError as e:
            logger.error(f"The file {input_csv} was not found: {e}")
            return
        except Exception as e:
            logger.error(f"Error reading input CSV file: {e}")
            return

        # Creating a DataFrame for the output
        df_output = pd.DataFrame(columns=['initial_prompt', 'prompt', 'ground_truth_completion', 'classification'])

        for index, row in df_input.iterrows():
            try:
                initial_prompt = row['prompt']
                completion = row['completion']

                # Use the rephrase_and_optionally_classify method
                rephrased_prompt, classification = self.rephrase_and_optionally_classify(
                    initial_prompt, model_name=model_name, classify=classify, classes=classes)
                #logger.debug(f"rephrased_prompt: {rephrased_prompt}")
                #logger.debug(f"classification: {classification}")

                new_row = {
                    "initial_prompt": initial_prompt,
                    "prompt": rephrased_prompt,
                    "ground_truth_completion": completion,
                    "classification": str(classification)
                }
                # Using pd.concat instead of append
                df_output = pd.concat([df_output, pd.DataFrame([new_row])], ignore_index=True)

                # To avoid hitting rate limits
                time.sleep(0.5)
            except Exception as e:
                logger.error(f"Error processing row {index}: {e}")

        # Save to CSV
        try:
            df_output.to_csv(output_csv, index=False)
        except Exception as e:
            logger.error(f"Error writing to output CSV file: {e}")
    def get_generated_completion(self, finetuned_model, prompt, temperature=1.0, max_tokens=256, top_p=1.0,
                                 finetuned_model_start_sequence_only_for_base_models=""):
        """
        Retrieves the generated completion from a specified model based on the given prompt.

        This method interacts with the OpenAI API to generate a completion based on the provided
        prompt and model parameters. It is designed to work with both base and fine-tuned models,
        offering various customization options for the generation process.

        :param finetuned_model: The name of the fine-tuned or base model to use for generating the completion.
        :type finetuned_model: str
        :param prompt: The input prompt to which the model generates a completion.
        :type prompt: str
        :param temperature: The sampling temperature, controlling the randomness of the output. Defaults to 1.0.
        :type temperature: float, optional
        :param max_tokens: The maximum number of tokens to generate in the response. Defaults to 256.
        :type max_tokens: int, optional
        :param top_p: The top-p sampling parameter, controlling the range of token probabilities considered for sampling. Defaults to 1.0.
        :type top_p: float, optional
        :param finetuned_model_start_sequence_only_for_base_models: A start sequence used only for base models, if applicable.
        :type finetuned_model_start_sequence_only_for_base_models: str, optional

        :return: The generated completion as a string.
        :rtype: str

        :raises ValueError: If an unknown model name is specified.
        :raises Exception: If there is an error during the generation process or with the OpenAI API interaction.

        :example:

        ::

            >>> finetuned_model = 'gpt-3.5-turbo'
            >>> prompt = 'Translate the following English text to French: Hello, how are you?'
            >>> completion = get_generated_completion(finetuned_model, prompt)
            >>> print(completion)
            # Outputs the model-generated translation of the prompt.
        """
        #logger.debug('eval_system_prompt', self.eval_system_prompt)
        try:
            if "gpt" in finetuned_model:
                response = self.client.chat.completions.create(
                    model=finetuned_model,
                    messages=[
                        {"role": "system", "content": self.generator_system_prompt},
                        {"role": "user", "content": prompt}
                    ],
                    temperature=temperature,
                    max_tokens=max_tokens,
                    top_p=top_p,
                    frequency_penalty=0,
                    presence_penalty=0,
                    stop=["END"]
                )
                return response.choices[0].message.content
            else:
                raise ValueError(f"Unknown model: {finetuned_model}")
        except Exception as e:
            raise Exception(f"Error in get_generated_completion: {e}")
    def score_response(self, prompt, ground_truth_completion, generated_completion, llm_evaluator_model_name="gpt-3.5-turbo"):
        """
        Generates a scoring prompt, sends it to a language model, and parses the response to
        evaluate a given generated completion against a ground truth.

        :method score_response: Evaluate the generated completion of a prompt against the ground truth completion using a large language model.
        :type score_response: method

        :param prompt: The original prompt used in generating the completion.
        :type prompt: str
        :param ground_truth_completion: The correct or expected response to the prompt.
        :type ground_truth_completion: str
        :param generated_completion: The response generated by the evaluated model.
        :type generated_completion: str
        :param llm_evaluator_model_name: Name of the large language model to be used for scoring. Defaults to "gpt-3.5-turbo".
        :type llm_evaluator_model_name: str, optional

        :return: A tuple containing the score (numeric) and the reasoning behind the score (string).
        :rtype: tuple

        :raises ValueError: If the response from the model is not in a valid JSON format.
        :raises Exception: For any other exceptions that may occur during API calls or processing.

        :example:

        ::

            >>> score, reason = score_response("What is AI?", "Artificial Intelligence", "AI is a field in computer science.", "gpt-3.5-turbo")
            # This evaluates the generated completion for accuracy and relevance, returning a score and reasoning.

        :notes:

        - The method constructs an evaluation prompt combining the original prompt, ground truth
          completion, and the generated completion.
        - It then sends this prompt to the specified language model for scoring and parses the
          model's response to extract the score and reasoning.
        """
        #system_prompt = "You are an intelligent assistant capable of evaluating responses based on accuracy and relevance."
        input_prompt = (
            f"The correct response to the prompt '{prompt}' was: '{ground_truth_completion}'. "
            f"The response generated by the test model was: '{generated_completion}'. "
        )
        try:
            response = self.client.chat.completions.create(
                model=llm_evaluator_model_name,
                messages=[
                    {"role": "system", "content": self.eval_system_prompt},
                    {"role": "user", "content": input_prompt},
                    {"role": "assistant", "content": ""}
                ],
            )
            response_content = json.loads(response.choices[0].message.content)
            score = response_content["score"]
            score_reason = response_content["score_reason"]
            return score, score_reason
        except json.JSONDecodeError:
            # json.JSONDecodeError cannot be re-raised without its (msg, doc, pos) arguments,
            # so surface the failure as a ValueError instead.
            raise ValueError("Failed to parse model response as JSON.")
        except Exception as e:
            raise Exception(f"An unexpected error occurred: {e}")
    def evaluate_model(self, finetuned_model, dataset_csv_path, results_csv_path, temperature=1.0, max_tokens=150,
                       top_p=1.0, frequency_penalty=0, presence_penalty=0, model_start_sequence="",
                       llm_evaluator_model_name="gpt-3.5-turbo", dataset_size=100,
                       finetuned_model_start_sequence_only_for_base_models="", experiment_id=1, save_immediately=False):
        """
        Evaluates the performance of a specified model using a dataset and generates statistical insights.

        This method assesses a model's ability to respond to prompts, comparing generated responses
        with expected completions. It provides a systematic approach to evaluate model performance,
        saving the results for analysis.

        :param finetuned_model: The name of the model to be evaluated.
        :type finetuned_model: str
        :param dataset_csv_path: Path to the CSV file containing prompts and expected completions for evaluation.
        :type dataset_csv_path: str
        :param results_csv_path: Path where the evaluation results will be saved.
        :type results_csv_path: str

        [Other parameters...]

        :return: None. The evaluation results are saved to the specified CSV file.
        :rtype: None

        :raises FileNotFoundError: If the dataset CSV file is not found.
        :raises ValueError: If the dataset CSV file is not properly formatted or missing required columns.
        :raises Exception: For other exceptions that may occur during the evaluation process.

        :example:

        ::

            >>> finetuned_model = 'gpt-3.5-turbo'
            >>> dataset_csv_path = 'path/to/dataset.csv'
            >>> results_csv_path = 'path/to/results.csv'
            >>> evaluate_model(finetuned_model, dataset_csv_path, results_csv_path)
            # This will evaluate the model using the dataset and save results to the specified path.
        """
        if not os.path.exists(dataset_csv_path):
            raise FileNotFoundError(f"[evaluate_model] The input CSV file {dataset_csv_path} was not found.")

        # Check if the file extension is .csv
        if not dataset_csv_path.endswith('.csv'):
            raise ValueError(f"[evaluate_model] The input file {dataset_csv_path} is not a CSV file. "
                             "Make sure that the dataset to evaluate is provided in CSV format.")

        try:
            df_input = pd.read_csv(dataset_csv_path)

            # Required columns
            required_columns = ['initial_prompt', 'prompt', 'ground_truth_completion', 'classification']

            # Check if all required columns are present
            if not all(column in df_input.columns for column in required_columns):
                missing_columns = [column for column in required_columns if column not in df_input.columns]
                raise ValueError(f"The CSV file is missing the following required columns: {', '.join(missing_columns)}")

            all_frames = []
            for index, row in df_input.iterrows():
                prompt = row["prompt"]
                ground_truth_completion = row["ground_truth_completion"]
                logger.debug(prompt)

                generated_completion = self.get_generated_completion(
                    finetuned_model, prompt, temperature, max_tokens, top_p,
                    finetuned_model_start_sequence_only_for_base_models)
                #logger.debug("*******> generated_completion: ", generated_completion)

                # Score the generated completion against the ground truth
                score, scoring_reason = self.score_response(
                    prompt, ground_truth_completion, generated_completion,
                    llm_evaluator_model_name=llm_evaluator_model_name)

                # Get the current timestamp
                current_time = datetime.now()
                timestamp_str = current_time.strftime("%Y%m%d%H%M%S")

                # Add the timestamp to the output_data dictionary
                output_data = {
                    "Timestamp": timestamp_str,
                    "Prompt": prompt,
                    "Ground Truth": row["ground_truth_completion"],
                    "Generated Response": generated_completion,
                    "Temperature": temperature,
                    "Max Tokens": max_tokens,
                    "Top P": top_p,
                    "Frequency Penalty": frequency_penalty,
                    "Presence Penalty": presence_penalty,
                    "Score": score,
                    "Classification": row["classification"],
                    "Model Name": finetuned_model,
                    "Dataset Size": dataset_size,
                    "GPT Eval Model": llm_evaluator_model_name,
                    "Scoring Reason": scoring_reason,
                    "Experiment_ID": experiment_id
                }
                # Append the dictionary to the list
                all_frames.append(output_data)

                # Debug logging
                logger.debug("row[%s] Score: %s Prompt: %s %s", index, score, prompt[:50], finetuned_model)
                logger.debug("ground_truth_completion: %s", ground_truth_completion[:100])
                logger.debug("generated_completion: %s", generated_completion[:100])
                logger.debug("scoring: %s", score)
                logger.debug("scoring_reason: %s", scoring_reason)

                # Save the data immediately if specified
                if save_immediately:
                    # Check if the output CSV file already exists
                    file_exists = os.path.exists(results_csv_path)
                    # Append the new data to the CSV file
                    mode = 'a' if file_exists else 'w'
                    header = not file_exists  # Write header only if file doesn't exist
                    pd.DataFrame([output_data]).to_csv(results_csv_path, mode=mode, header=header, index=False)

                # To avoid hitting rate limits
                time.sleep(1.0)

            if not save_immediately:
                # Create a DataFrame from the list of dictionaries and save it to a CSV file
                df = pd.DataFrame(all_frames)
                df.to_csv(results_csv_path, index=False)

            # Generate statistics and save them
            #self.save_stats(final_df, psugpt_model)
        except pd.errors.ParserError as e:
            raise ValueError(f"The file {dataset_csv_path} could not be parsed as CSV: {e}")
        except openai.APIConnectionError as e:
            logger.error(f"[evaluate_model] APIConnectionError error:\n{e}")
        except openai.RateLimitError as e:
            logger.error(f"[evaluate_model] RateLimit Error {e}. Trying again in 0.5 seconds...")
        except openai.APIStatusError as e:
            logger.error(f"[evaluate_model] APIStatusError:\n{e}")
        except AttributeError as e:
            logger.error(f"[evaluate_model] AttributeError:\n{e}")
        except Exception as e:
            raise Exception(f"An error occurred during model evaluation: {e}")
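

# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative only; not part of the library source).
# The file paths below are hypothetical placeholders, and the workflow simply
# chains the methods defined above: rephrase/classify a prompt dataset, then
# evaluate a model on it, writing one scored row per prompt to a results CSV.
# ---------------------------------------------------------------------------
# if __name__ == "__main__":
#     evaluator = LLMEvaluation(model_name="gpt-3.5-turbo")
#     # 1. Rephrase (and optionally classify) the prompts of an existing dataset.
#     evaluator.rephrase_and_classify_prompts_in_dataset(
#         "dataset.csv", "output/rephrased.csv", classify=True)
#     # 2. Evaluate a (fine-tuned) model against the rephrased dataset.
#     evaluator.evaluate_model(
#         "gpt-3.5-turbo", "output/rephrased.csv", "output/results.csv",
#         llm_evaluator_model_name="gpt-3.5-turbo", save_immediately=True)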