Skip to content
Snippets Groups Projects
Commit 30b9acc8 authored by Rahil Doshi
Browse files

Merge branch 'development' into release/v0.1.0

parents 9da883bc 285160dc
No related branches found
No related tags found
No related merge requests found
Pipeline #74877 passed
...@@ -109,24 +109,35 @@ def read_data_from_file(file_config: Union[str, Dict], header: bool = True) -> T ...@@ -109,24 +109,35 @@ def read_data_from_file(file_config: Union[str, Dict], header: bool = True) -> T
Args: Args:
file_config: Either a path string or a dictionary containing file configuration file_config: Either a path string or a dictionary containing file configuration
If dictionary, required keys: If string (direct path):
- File must have exactly 2 columns
- First column is temperature, second is property
If dictionary:
- file: Path to data file - file: Path to data file
- temp_col: Temperature column name/index - temp_col: Temperature column name/index
- prop_col: Property column name/index - prop_col: Property column name/index
If string, treated as direct file path
header (bool): Indicates if the file contains a header row. header (bool): Indicates if the file contains a header row.
Returns: Returns:
Tuple[np.ndarray, np.ndarray]: Temperature and property arrays Tuple[np.ndarray, np.ndarray]: Temperature and property arrays
Raises:
ValueError: If:
- For direct path: Data doesn't have exactly two columns
- For dictionary config: Specified column names don't match headers
- Data contains NaN values
- Data contains duplicate temperature entries
""" """
# Handle string (direct path) or dictionary configuration # Handle string (direct path) or dictionary configuration
if isinstance(file_config, str): if isinstance(file_config, str):
file_path = file_config file_path = file_config
direct_path = True
# For direct file paths, assume first two columns are temperature and property # For direct file paths, assume first two columns are temperature and property
temp_col = 0 temp_col = 0
prop_col = 1 prop_col = 1
else: else:
file_path = file_config['file'] file_path = file_config['file']
direct_path = False
temp_col = file_config['temp_col'] temp_col = file_config['temp_col']
prop_col = file_config['prop_col'] prop_col = file_config['prop_col']
...@@ -138,35 +149,111 @@ def read_data_from_file(file_config: Union[str, Dict], header: bool = True) -> T ...@@ -138,35 +149,111 @@ def read_data_from_file(file_config: Union[str, Dict], header: bool = True) -> T
# Use pandas read_csv for CSV files # Use pandas read_csv for CSV files
df = pd.read_csv(file_path, header=0 if header else None) df = pd.read_csv(file_path, header=0 if header else None)
else: else:
# For txt files, assume columns are space/tab separated # For txt files
data = np.loadtxt(file_path, dtype=float, skiprows=1 if header else 0) if header:
if data.ndim != 2: # Read the header line to get column names
raise ValueError("Data should be two-dimensional") with open(file_path, 'r') as f:
header_line = f.readline().strip()
# Handle both column name (which would be an index for txt files) and column index column_names = header_line.split()
if isinstance(temp_col, int):
temp = data[:, temp_col] # Now read the data
else: data = np.loadtxt(file_path, dtype=float, skiprows=1)
temp = data[:, 0] # Default to first column
# Direct path case - check for exactly 2 columns
if isinstance(prop_col, int): if direct_path:
prop = data[:, prop_col] if data.shape[1] != 2:
raise ValueError(f"Data should have exactly two columns, but found {data.shape[1]} columns")
temp = data[:, 0]
prop = data[:, 1]
# Dictionary case - match column names
else:
# Handle temperature column
if isinstance(temp_col, str):
if temp_col in column_names:
temp_idx = column_names.index(temp_col)
else:
raise ValueError(f"Temperature column '{temp_col}' not found in file. "
f"Available columns: {', '.join(column_names)}")
else:
if temp_col >= data.shape[1]:
raise ValueError(f"Temperature column index {temp_col} out of bounds (file has {data.shape[1]} columns)")
temp_idx = temp_col
# Handle property column
if isinstance(prop_col, str):
if prop_col in column_names:
prop_idx = column_names.index(prop_col)
else:
raise ValueError(f"Property column '{prop_col}' not found in file. "
f"Available columns: {', '.join(column_names)}")
else:
if prop_col >= data.shape[1]:
raise ValueError(f"Property column index {prop_col} out of bounds (file has {data.shape[1]} columns)")
prop_idx = prop_col
temp = data[:, temp_idx]
prop = data[:, prop_idx]
else: else:
prop = data[:, 1] # Default to second column # No header
data = np.loadtxt(file_path, dtype=float, skiprows=0)
# Direct path case - check for exactly 2 columns
if direct_path:
if data.shape[1] != 2:
raise ValueError(f"Data should have exactly two columns, but found {data.shape[1]} columns")
temp = data[:, 0]
prop = data[:, 1]
# Dictionary case - use column indices
else:
if isinstance(temp_col, str):
raise ValueError(f"Column name '{temp_col}' specified, but file has no header row")
if isinstance(prop_col, str):
raise ValueError(f"Column name '{prop_col}' specified, but file has no header row")
if temp_col >= data.shape[1]:
raise ValueError(f"Temperature column index {temp_col} out of bounds (file has {data.shape[1]} columns)")
if prop_col >= data.shape[1]:
raise ValueError(f"Property column index {prop_col} out of bounds (file has {data.shape[1]} columns)")
temp = data[:, temp_col]
prop = data[:, prop_col]
# Check for NaN values
if np.any(np.isnan(temp)) or np.any(np.isnan(prop)):
nan_rows = np.where(np.isnan(temp) | np.isnan(prop))[0] + 1
raise ValueError(f"Data contains NaN values in rows: {', '.join(map(str, nan_rows))}")
# Check for duplicate temperatures
unique_temp, counts = np.unique(temp, return_counts=True)
duplicates = unique_temp[counts > 1]
if len(duplicates) > 0:
duplicate_rows = [str(idx + 1) for idx, value in enumerate(temp) if value in duplicates]
raise ValueError(f"Duplicate temperature entries found in rows: {', '.join(duplicate_rows)}")
# Skip the pandas processing below
return temp, prop return temp, prop
# Process pandas DataFrame (for both Excel and CSV) # Process pandas DataFrame (for both Excel and CSV)
# Handle both column name (string) and column index (integer) # Handle both column name (string) and column index (integer)
if isinstance(temp_col, str): if isinstance(temp_col, str):
temp = df[temp_col].to_numpy(dtype=np.float64) if temp_col in df.columns:
temp = df[temp_col].to_numpy(dtype=np.float64)
else:
raise ValueError(f"Temperature column '{temp_col}' not found in file. "
f"Available columns: {', '.join(df.columns)}")
else: else:
if temp_col >= len(df.columns):
raise ValueError(f"Temperature column index {temp_col} out of bounds (file has {len(df.columns)} columns)")
temp = df.iloc[:, temp_col].to_numpy(dtype=np.float64) temp = df.iloc[:, temp_col].to_numpy(dtype=np.float64)
if isinstance(prop_col, str): if isinstance(prop_col, str):
prop = df[prop_col].to_numpy(dtype=np.float64) if prop_col in df.columns:
prop = df[prop_col].to_numpy(dtype=np.float64)
else:
raise ValueError(f"Property column '{prop_col}' not found in file. "
f"Available columns: {', '.join(df.columns)}")
else: else:
if prop_col >= len(df.columns):
raise ValueError(f"Property column index {prop_col} out of bounds (file has {len(df.columns)} columns)")
prop = df.iloc[:, prop_col].to_numpy(dtype=np.float64) prop = df.iloc[:, prop_col].to_numpy(dtype=np.float64)
# Check for NaN values # Check for NaN values
......
import pytest import pytest
import numpy as np import numpy as np
from pymatlib.core.data_handler import ( from pymatlib.core.data_handler import (
read_data, celsius_to_kelvin, fahrenheit_to_kelvin, read_data_from_txt, celsius_to_kelvin, fahrenheit_to_kelvin,
thousand_times thousand_times
) )
...@@ -35,7 +35,7 @@ def test_read_data(tmp_path): ...@@ -35,7 +35,7 @@ def test_read_data(tmp_path):
3.0 30.0 3.0 30.0
""") """)
temp, prop = read_data(str(test_file)) temp, prop = read_data_from_txt(str(test_file))
assert np.allclose(temp, [1.0, 2.0, 3.0]) assert np.allclose(temp, [1.0, 2.0, 3.0])
assert np.allclose(prop, [10.0, 20.0, 30.0]) assert np.allclose(prop, [10.0, 20.0, 30.0])
...@@ -48,7 +48,7 @@ NaN 20.0 ...@@ -48,7 +48,7 @@ NaN 20.0
""") """)
with pytest.raises(ValueError, match="Data contains NaN values in rows: 2"): with pytest.raises(ValueError, match="Data contains NaN values in rows: 2"):
read_data(str(invalid_file)) read_data_from_txt(str(invalid_file))
def test_read_data_errors(tmp_path): def test_read_data_errors(tmp_path):
"""Test error handling in read_data function.""" """Test error handling in read_data function."""
...@@ -77,10 +77,10 @@ NaN 20.0 ...@@ -77,10 +77,10 @@ NaN 20.0
# Test each error case # Test each error case
with pytest.raises(ValueError, match="Data should have exactly two columns"): with pytest.raises(ValueError, match="Data should have exactly two columns"):
temp, prop = read_data(str(wrong_columns)) temp, prop = read_data_from_txt(str(wrong_columns))
with pytest.raises(ValueError, match="Data contains NaN values"): with pytest.raises(ValueError, match="Data contains NaN values"):
temp, prop = read_data(str(nan_file)) temp, prop = read_data_from_txt(str(nan_file))
with pytest.raises(ValueError, match="Duplicate temperature entries found"): with pytest.raises(ValueError, match="Duplicate temperature entries found"):
temp, prop = read_data(str(duplicate_file)) temp, prop = read_data_from_txt(str(duplicate_file))
0% Loading. Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment