Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
P
pymatlib
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Package registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Terms and privacy
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Rahil Doshi
pymatlib
Commits
30b9acc8
Commit
30b9acc8
authored
2 months ago
by
Rahil Doshi
Browse files
Options
Downloads
Plain Diff
Merge branch 'development' into release/v0.1.0
parents
9da883bc
285160dc
No related branches found
No related tags found
No related merge requests found
Pipeline
#74877
passed
2 months ago
Stage: test
Changes
2
Pipelines
1
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
src/pymatlib/core/data_handler.py
+106
-19
106 additions, 19 deletions
src/pymatlib/core/data_handler.py
tests/python/test_data_handler.py
+6
-6
6 additions, 6 deletions
tests/python/test_data_handler.py
with
112 additions
and
25 deletions
src/pymatlib/core/data_handler.py
+
106
−
19
View file @
30b9acc8
...
@@ -109,24 +109,35 @@ def read_data_from_file(file_config: Union[str, Dict], header: bool = True) -> T
...
@@ -109,24 +109,35 @@ def read_data_from_file(file_config: Union[str, Dict], header: bool = True) -> T
Args:
Args:
file_config: Either a path string or a dictionary containing file configuration
file_config: Either a path string or a dictionary containing file configuration
If dictionary, required keys:
If string (direct path):
- File must have exactly 2 columns
- First column is temperature, second is property
If dictionary:
- file: Path to data file
- file: Path to data file
- temp_col: Temperature column name/index
- temp_col: Temperature column name/index
- prop_col: Property column name/index
- prop_col: Property column name/index
If string, treated as direct file path
header (bool): Indicates if the file contains a header row.
header (bool): Indicates if the file contains a header row.
Returns:
Returns:
Tuple[np.ndarray, np.ndarray]: Temperature and property arrays
Tuple[np.ndarray, np.ndarray]: Temperature and property arrays
Raises:
ValueError: If:
- For direct path: Data doesn't have exactly two columns
- For dictionary config: Specified column names don't match headers
- Data contains NaN values
- Data contains duplicate temperature entries
"""
"""
# Handle string (direct path) or dictionary configuration
# Handle string (direct path) or dictionary configuration
if
isinstance
(
file_config
,
str
):
if
isinstance
(
file_config
,
str
):
file_path
=
file_config
file_path
=
file_config
direct_path
=
True
# For direct file paths, assume first two columns are temperature and property
# For direct file paths, assume first two columns are temperature and property
temp_col
=
0
temp_col
=
0
prop_col
=
1
prop_col
=
1
else
:
else
:
file_path
=
file_config
[
'
file
'
]
file_path
=
file_config
[
'
file
'
]
direct_path
=
False
temp_col
=
file_config
[
'
temp_col
'
]
temp_col
=
file_config
[
'
temp_col
'
]
prop_col
=
file_config
[
'
prop_col
'
]
prop_col
=
file_config
[
'
prop_col
'
]
...
@@ -138,35 +149,111 @@ def read_data_from_file(file_config: Union[str, Dict], header: bool = True) -> T
...
@@ -138,35 +149,111 @@ def read_data_from_file(file_config: Union[str, Dict], header: bool = True) -> T
# Use pandas read_csv for CSV files
# Use pandas read_csv for CSV files
df
=
pd
.
read_csv
(
file_path
,
header
=
0
if
header
else
None
)
df
=
pd
.
read_csv
(
file_path
,
header
=
0
if
header
else
None
)
else
:
else
:
# For txt files, assume columns are space/tab separated
# For txt files
data
=
np
.
loadtxt
(
file_path
,
dtype
=
float
,
skiprows
=
1
if
header
else
0
)
if
header
:
if
data
.
ndim
!=
2
:
# Read the header line to get column names
raise
ValueError
(
"
Data should be two-dimensional
"
)
with
open
(
file_path
,
'
r
'
)
as
f
:
header_line
=
f
.
readline
().
strip
()
# Handle both column name (which would be an index for txt files) and column index
column_names
=
header_line
.
split
()
if
isinstance
(
temp_col
,
int
):
temp
=
data
[:,
temp_col
]
# Now read the data
else
:
data
=
np
.
loadtxt
(
file_path
,
dtype
=
float
,
skiprows
=
1
)
temp
=
data
[:,
0
]
# Default to first column
# Direct path case - check for exactly 2 columns
if
isinstance
(
prop_col
,
int
):
if
direct_path
:
prop
=
data
[:,
prop_col
]
if
data
.
shape
[
1
]
!=
2
:
raise
ValueError
(
f
"
Data should have exactly two columns, but found
{
data
.
shape
[
1
]
}
columns
"
)
temp
=
data
[:,
0
]
prop
=
data
[:,
1
]
# Dictionary case - match column names
else
:
# Handle temperature column
if
isinstance
(
temp_col
,
str
):
if
temp_col
in
column_names
:
temp_idx
=
column_names
.
index
(
temp_col
)
else
:
raise
ValueError
(
f
"
Temperature column
'
{
temp_col
}
'
not found in file.
"
f
"
Available columns:
{
'
,
'
.
join
(
column_names
)
}
"
)
else
:
if
temp_col
>=
data
.
shape
[
1
]:
raise
ValueError
(
f
"
Temperature column index
{
temp_col
}
out of bounds (file has
{
data
.
shape
[
1
]
}
columns)
"
)
temp_idx
=
temp_col
# Handle property column
if
isinstance
(
prop_col
,
str
):
if
prop_col
in
column_names
:
prop_idx
=
column_names
.
index
(
prop_col
)
else
:
raise
ValueError
(
f
"
Property column
'
{
prop_col
}
'
not found in file.
"
f
"
Available columns:
{
'
,
'
.
join
(
column_names
)
}
"
)
else
:
if
prop_col
>=
data
.
shape
[
1
]:
raise
ValueError
(
f
"
Property column index
{
prop_col
}
out of bounds (file has
{
data
.
shape
[
1
]
}
columns)
"
)
prop_idx
=
prop_col
temp
=
data
[:,
temp_idx
]
prop
=
data
[:,
prop_idx
]
else
:
else
:
prop
=
data
[:,
1
]
# Default to second column
# No header
data
=
np
.
loadtxt
(
file_path
,
dtype
=
float
,
skiprows
=
0
)
# Direct path case - check for exactly 2 columns
if
direct_path
:
if
data
.
shape
[
1
]
!=
2
:
raise
ValueError
(
f
"
Data should have exactly two columns, but found
{
data
.
shape
[
1
]
}
columns
"
)
temp
=
data
[:,
0
]
prop
=
data
[:,
1
]
# Dictionary case - use column indices
else
:
if
isinstance
(
temp_col
,
str
):
raise
ValueError
(
f
"
Column name
'
{
temp_col
}
'
specified, but file has no header row
"
)
if
isinstance
(
prop_col
,
str
):
raise
ValueError
(
f
"
Column name
'
{
prop_col
}
'
specified, but file has no header row
"
)
if
temp_col
>=
data
.
shape
[
1
]:
raise
ValueError
(
f
"
Temperature column index
{
temp_col
}
out of bounds (file has
{
data
.
shape
[
1
]
}
columns)
"
)
if
prop_col
>=
data
.
shape
[
1
]:
raise
ValueError
(
f
"
Property column index
{
prop_col
}
out of bounds (file has
{
data
.
shape
[
1
]
}
columns)
"
)
temp
=
data
[:,
temp_col
]
prop
=
data
[:,
prop_col
]
# Check for NaN values
if
np
.
any
(
np
.
isnan
(
temp
))
or
np
.
any
(
np
.
isnan
(
prop
)):
nan_rows
=
np
.
where
(
np
.
isnan
(
temp
)
|
np
.
isnan
(
prop
))[
0
]
+
1
raise
ValueError
(
f
"
Data contains NaN values in rows:
{
'
,
'
.
join
(
map
(
str
,
nan_rows
))
}
"
)
# Check for duplicate temperatures
unique_temp
,
counts
=
np
.
unique
(
temp
,
return_counts
=
True
)
duplicates
=
unique_temp
[
counts
>
1
]
if
len
(
duplicates
)
>
0
:
duplicate_rows
=
[
str
(
idx
+
1
)
for
idx
,
value
in
enumerate
(
temp
)
if
value
in
duplicates
]
raise
ValueError
(
f
"
Duplicate temperature entries found in rows:
{
'
,
'
.
join
(
duplicate_rows
)
}
"
)
# Skip the pandas processing below
return
temp
,
prop
return
temp
,
prop
# Process pandas DataFrame (for both Excel and CSV)
# Process pandas DataFrame (for both Excel and CSV)
# Handle both column name (string) and column index (integer)
# Handle both column name (string) and column index (integer)
if
isinstance
(
temp_col
,
str
):
if
isinstance
(
temp_col
,
str
):
temp
=
df
[
temp_col
].
to_numpy
(
dtype
=
np
.
float64
)
if
temp_col
in
df
.
columns
:
temp
=
df
[
temp_col
].
to_numpy
(
dtype
=
np
.
float64
)
else
:
raise
ValueError
(
f
"
Temperature column
'
{
temp_col
}
'
not found in file.
"
f
"
Available columns:
{
'
,
'
.
join
(
df
.
columns
)
}
"
)
else
:
else
:
if
temp_col
>=
len
(
df
.
columns
):
raise
ValueError
(
f
"
Temperature column index
{
temp_col
}
out of bounds (file has
{
len
(
df
.
columns
)
}
columns)
"
)
temp
=
df
.
iloc
[:,
temp_col
].
to_numpy
(
dtype
=
np
.
float64
)
temp
=
df
.
iloc
[:,
temp_col
].
to_numpy
(
dtype
=
np
.
float64
)
if
isinstance
(
prop_col
,
str
):
if
isinstance
(
prop_col
,
str
):
prop
=
df
[
prop_col
].
to_numpy
(
dtype
=
np
.
float64
)
if
prop_col
in
df
.
columns
:
prop
=
df
[
prop_col
].
to_numpy
(
dtype
=
np
.
float64
)
else
:
raise
ValueError
(
f
"
Property column
'
{
prop_col
}
'
not found in file.
"
f
"
Available columns:
{
'
,
'
.
join
(
df
.
columns
)
}
"
)
else
:
else
:
if
prop_col
>=
len
(
df
.
columns
):
raise
ValueError
(
f
"
Property column index
{
prop_col
}
out of bounds (file has
{
len
(
df
.
columns
)
}
columns)
"
)
prop
=
df
.
iloc
[:,
prop_col
].
to_numpy
(
dtype
=
np
.
float64
)
prop
=
df
.
iloc
[:,
prop_col
].
to_numpy
(
dtype
=
np
.
float64
)
# Check for NaN values
# Check for NaN values
...
...
This diff is collapsed.
Click to expand it.
tests/python/test_data_handler.py
+
6
−
6
View file @
30b9acc8
import
pytest
import
pytest
import
numpy
as
np
import
numpy
as
np
from
pymatlib.core.data_handler
import
(
from
pymatlib.core.data_handler
import
(
read_data
,
celsius_to_kelvin
,
fahrenheit_to_kelvin
,
read_data
_from_txt
,
celsius_to_kelvin
,
fahrenheit_to_kelvin
,
thousand_times
thousand_times
)
)
...
@@ -35,7 +35,7 @@ def test_read_data(tmp_path):
...
@@ -35,7 +35,7 @@ def test_read_data(tmp_path):
3.0 30.0
3.0 30.0
"""
)
"""
)
temp
,
prop
=
read_data
(
str
(
test_file
))
temp
,
prop
=
read_data
_from_txt
(
str
(
test_file
))
assert
np
.
allclose
(
temp
,
[
1.0
,
2.0
,
3.0
])
assert
np
.
allclose
(
temp
,
[
1.0
,
2.0
,
3.0
])
assert
np
.
allclose
(
prop
,
[
10.0
,
20.0
,
30.0
])
assert
np
.
allclose
(
prop
,
[
10.0
,
20.0
,
30.0
])
...
@@ -48,7 +48,7 @@ NaN 20.0
...
@@ -48,7 +48,7 @@ NaN 20.0
"""
)
"""
)
with
pytest
.
raises
(
ValueError
,
match
=
"
Data contains NaN values in rows: 2
"
):
with
pytest
.
raises
(
ValueError
,
match
=
"
Data contains NaN values in rows: 2
"
):
read_data
(
str
(
invalid_file
))
read_data
_from_txt
(
str
(
invalid_file
))
def
test_read_data_errors
(
tmp_path
):
def
test_read_data_errors
(
tmp_path
):
"""
Test error handling in read_data function.
"""
"""
Test error handling in read_data function.
"""
...
@@ -77,10 +77,10 @@ NaN 20.0
...
@@ -77,10 +77,10 @@ NaN 20.0
# Test each error case
# Test each error case
with
pytest
.
raises
(
ValueError
,
match
=
"
Data should have exactly two columns
"
):
with
pytest
.
raises
(
ValueError
,
match
=
"
Data should have exactly two columns
"
):
temp
,
prop
=
read_data
(
str
(
wrong_columns
))
temp
,
prop
=
read_data
_from_txt
(
str
(
wrong_columns
))
with
pytest
.
raises
(
ValueError
,
match
=
"
Data contains NaN values
"
):
with
pytest
.
raises
(
ValueError
,
match
=
"
Data contains NaN values
"
):
temp
,
prop
=
read_data
(
str
(
nan_file
))
temp
,
prop
=
read_data
_from_txt
(
str
(
nan_file
))
with
pytest
.
raises
(
ValueError
,
match
=
"
Duplicate temperature entries found
"
):
with
pytest
.
raises
(
ValueError
,
match
=
"
Duplicate temperature entries found
"
):
temp
,
prop
=
read_data
(
str
(
duplicate_file
))
temp
,
prop
=
read_data
_from_txt
(
str
(
duplicate_file
))
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment