diff --git a/src/pymatlib/core/data_handler.py b/src/pymatlib/core/data_handler.py index e14bc71c8f9922c865006a9b31c39d1f36da1116..0ce948709c0725c82c99614884d411c8c51886fa 100644 --- a/src/pymatlib/core/data_handler.py +++ b/src/pymatlib/core/data_handler.py @@ -109,24 +109,35 @@ def read_data_from_file(file_config: Union[str, Dict], header: bool = True) -> T Args: file_config: Either a path string or a dictionary containing file configuration - If dictionary, required keys: + If string (direct path): + - File must have exactly 2 columns + - First column is temperature, second is property + If dictionary: - file: Path to data file - temp_col: Temperature column name/index - prop_col: Property column name/index - If string, treated as direct file path header (bool): Indicates if the file contains a header row. Returns: Tuple[np.ndarray, np.ndarray]: Temperature and property arrays + + Raises: + ValueError: If: + - For direct path: Data doesn't have exactly two columns + - For dictionary config: Specified column names don't match headers + - Data contains NaN values + - Data contains duplicate temperature entries """ # Handle string (direct path) or dictionary configuration if isinstance(file_config, str): file_path = file_config + direct_path = True # For direct file paths, assume first two columns are temperature and property temp_col = 0 prop_col = 1 else: file_path = file_config['file'] + direct_path = False temp_col = file_config['temp_col'] prop_col = file_config['prop_col'] @@ -138,35 +149,111 @@ def read_data_from_file(file_config: Union[str, Dict], header: bool = True) -> T # Use pandas read_csv for CSV files df = pd.read_csv(file_path, header=0 if header else None) else: - # For txt files, assume columns are space/tab separated - data = np.loadtxt(file_path, dtype=float, skiprows=1 if header else 0) - if data.ndim != 2: - raise ValueError("Data should be two-dimensional") - - # Handle both column name (which would be an index for txt files) and column index - if isinstance(temp_col, int): - temp = data[:, temp_col] - else: - temp = data[:, 0] # Default to first column - - if isinstance(prop_col, int): - prop = data[:, prop_col] + # For txt files + if header: + # Read the header line to get column names + with open(file_path, 'r') as f: + header_line = f.readline().strip() + column_names = header_line.split() + + # Now read the data + data = np.loadtxt(file_path, dtype=float, skiprows=1) + + # Direct path case - check for exactly 2 columns + if direct_path: + if data.shape[1] != 2: + raise ValueError(f"Data should have exactly two columns, but found {data.shape[1]} columns") + temp = data[:, 0] + prop = data[:, 1] + # Dictionary case - match column names + else: + # Handle temperature column + if isinstance(temp_col, str): + if temp_col in column_names: + temp_idx = column_names.index(temp_col) + else: + raise ValueError(f"Temperature column '{temp_col}' not found in file. " + f"Available columns: {', '.join(column_names)}") + else: + if temp_col >= data.shape[1]: + raise ValueError(f"Temperature column index {temp_col} out of bounds (file has {data.shape[1]} columns)") + temp_idx = temp_col + + # Handle property column + if isinstance(prop_col, str): + if prop_col in column_names: + prop_idx = column_names.index(prop_col) + else: + raise ValueError(f"Property column '{prop_col}' not found in file. " + f"Available columns: {', '.join(column_names)}") + else: + if prop_col >= data.shape[1]: + raise ValueError(f"Property column index {prop_col} out of bounds (file has {data.shape[1]} columns)") + prop_idx = prop_col + + temp = data[:, temp_idx] + prop = data[:, prop_idx] else: - prop = data[:, 1] # Default to second column + # No header + data = np.loadtxt(file_path, dtype=float, skiprows=0) + + # Direct path case - check for exactly 2 columns + if direct_path: + if data.shape[1] != 2: + raise ValueError(f"Data should have exactly two columns, but found {data.shape[1]} columns") + temp = data[:, 0] + prop = data[:, 1] + # Dictionary case - use column indices + else: + if isinstance(temp_col, str): + raise ValueError(f"Column name '{temp_col}' specified, but file has no header row") + if isinstance(prop_col, str): + raise ValueError(f"Column name '{prop_col}' specified, but file has no header row") + + if temp_col >= data.shape[1]: + raise ValueError(f"Temperature column index {temp_col} out of bounds (file has {data.shape[1]} columns)") + if prop_col >= data.shape[1]: + raise ValueError(f"Property column index {prop_col} out of bounds (file has {data.shape[1]} columns)") + + temp = data[:, temp_col] + prop = data[:, prop_col] + + # Check for NaN values + if np.any(np.isnan(temp)) or np.any(np.isnan(prop)): + nan_rows = np.where(np.isnan(temp) | np.isnan(prop))[0] + 1 + raise ValueError(f"Data contains NaN values in rows: {', '.join(map(str, nan_rows))}") + + # Check for duplicate temperatures + unique_temp, counts = np.unique(temp, return_counts=True) + duplicates = unique_temp[counts > 1] + if len(duplicates) > 0: + duplicate_rows = [str(idx + 1) for idx, value in enumerate(temp) if value in duplicates] + raise ValueError(f"Duplicate temperature entries found in rows: {', '.join(duplicate_rows)}") - # Skip the pandas processing below return temp, prop # Process pandas DataFrame (for both Excel and CSV) # Handle both column name (string) and column index (integer) if isinstance(temp_col, str): - temp = df[temp_col].to_numpy(dtype=np.float64) + if temp_col in df.columns: + temp = df[temp_col].to_numpy(dtype=np.float64) + else: + raise ValueError(f"Temperature column '{temp_col}' not found in file. " + f"Available columns: {', '.join(df.columns)}") else: + if temp_col >= len(df.columns): + raise ValueError(f"Temperature column index {temp_col} out of bounds (file has {len(df.columns)} columns)") temp = df.iloc[:, temp_col].to_numpy(dtype=np.float64) if isinstance(prop_col, str): - prop = df[prop_col].to_numpy(dtype=np.float64) + if prop_col in df.columns: + prop = df[prop_col].to_numpy(dtype=np.float64) + else: + raise ValueError(f"Property column '{prop_col}' not found in file. " + f"Available columns: {', '.join(df.columns)}") else: + if prop_col >= len(df.columns): + raise ValueError(f"Property column index {prop_col} out of bounds (file has {len(df.columns)} columns)") prop = df.iloc[:, prop_col].to_numpy(dtype=np.float64) # Check for NaN values diff --git a/tests/python/test_data_handler.py b/tests/python/test_data_handler.py index b5577fbf658d734920e7606497741298f21fe1bf..58bc42ec219cc750217fcbbd7963364743589958 100644 --- a/tests/python/test_data_handler.py +++ b/tests/python/test_data_handler.py @@ -1,7 +1,7 @@ import pytest import numpy as np from pymatlib.core.data_handler import ( - read_data, celsius_to_kelvin, fahrenheit_to_kelvin, + read_data_from_txt, celsius_to_kelvin, fahrenheit_to_kelvin, thousand_times ) @@ -35,7 +35,7 @@ def test_read_data(tmp_path): 3.0 30.0 """) - temp, prop = read_data(str(test_file)) + temp, prop = read_data_from_txt(str(test_file)) assert np.allclose(temp, [1.0, 2.0, 3.0]) assert np.allclose(prop, [10.0, 20.0, 30.0]) @@ -48,7 +48,7 @@ NaN 20.0 """) with pytest.raises(ValueError, match="Data contains NaN values in rows: 2"): - read_data(str(invalid_file)) + read_data_from_txt(str(invalid_file)) def test_read_data_errors(tmp_path): """Test error handling in read_data function.""" @@ -77,10 +77,10 @@ NaN 20.0 # Test each error case with pytest.raises(ValueError, match="Data should have exactly two columns"): - temp, prop = read_data(str(wrong_columns)) + temp, prop = read_data_from_txt(str(wrong_columns)) with pytest.raises(ValueError, match="Data contains NaN values"): - temp, prop = read_data(str(nan_file)) + temp, prop = read_data_from_txt(str(nan_file)) with pytest.raises(ValueError, match="Duplicate temperature entries found"): - temp, prop = read_data(str(duplicate_file)) + temp, prop = read_data_from_txt(str(duplicate_file))