Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
pystencils
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Model registry
Operate
Environments
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Terms and privacy
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
pycodegen
pystencils
Commits
d38aed46
Commit
d38aed46
authored
5 years ago
by
Michael Kuron
Browse files
Options
Downloads
Patches
Plain Diff
AES-NI RNG
parent
7750926f
No related branches found
No related tags found
1 merge request
!30
AES-NI Random Number Generator
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
pystencils/include/aesni_rand.h
+66
-0
66 additions, 0 deletions
pystencils/include/aesni_rand.h
pystencils/rng.py
+37
-19
37 additions, 19 deletions
pystencils/rng.py
pystencils_tests/test_random.py
+47
-1
47 additions, 1 deletion
pystencils_tests/test_random.py
with
150 additions
and
20 deletions
pystencils/include/aesni_rand.h
0 → 100644
+
66
−
0
View file @
d38aed46
#if !defined(__AES__) || !defined(__SSE2__)
#error AES-NI and SSE2 need to be enabled
#endif
#include
<x86intrin.h>
#include
<cstdint>
#define QUALIFIERS inline
#define TWOPOW53_INV_DOUBLE (1.1102230246251565e-16)
#define TWOPOW32_INV_FLOAT (2.3283064e-10f)
typedef
std
::
uint32_t
uint32
;
typedef
std
::
uint64_t
uint64
;
// Runs the 128-bit block `in` through an AES-NI round sequence: an initial
// XOR with `k`, nine _mm_aesenc_si128 rounds and one closing
// _mm_aesenclast_si128 round. The same value `k` is supplied as the round key
// in every step (no key schedule is applied here).
QUALIFIERS __m128i aesni1xm128i(const __m128i & in, const __m128i & k)
{
    __m128i state = _mm_xor_si128(k, in);
    for (int round = 0; round < 9; ++round)
    {
        state = _mm_aesenc_si128(state, k);
    }
    return _mm_aesenclast_si128(state, k);
}
// Combines two 32-bit words into one double.
// `y` is shifted left by 53 - 32 = 21 bits and XORed with `x`, giving an
// integer below 2^53. That integer is scaled by TWOPOW53_INV_DOUBLE and
// offset by half of that constant.
QUALIFIERS double _uniform_double_hq(uint32 x, uint32 y)
{
    const uint64 low_bits = (uint64) x;
    const uint64 high_bits = (uint64) y << (53 - 32);
    const uint64 z = low_bits ^ high_bits;
    return z * TWOPOW53_INV_DOUBLE + (TWOPOW53_INV_DOUBLE / 2.0);
}
// Produces two doubles from a 4x32-bit counter and a 4x32-bit key.
// The counter and key words are packed into 128-bit registers (ctr0 / key0 in
// the lowest lane), passed through aesni1xm128i, and the four resulting 32-bit
// words are converted pairwise by _uniform_double_hq:
// words 0,1 -> rnd1 and words 2,3 -> rnd2.
QUALIFIERS void aesni_double2(uint32 ctr0, uint32 ctr1, uint32 ctr2, uint32 ctr3,
                              uint32 key0, uint32 key1, uint32 key2, uint32 key3,
                              double & rnd1, double & rnd2)
{
    const __m128i counter = _mm_set_epi32(ctr3, ctr2, ctr1, ctr0);
    const __m128i key = _mm_set_epi32(key3, key2, key1, key0);
    const __m128i block = aesni1xm128i(counter, key);

    // Unaligned store: the local array carries no 16-byte alignment guarantee.
    uint32 words[4];
    _mm_storeu_si128(reinterpret_cast<__m128i *>(words), block);

    rnd1 = _uniform_double_hq(words[0], words[1]);
    rnd2 = _uniform_double_hq(words[2], words[3]);
}
// Produces four floats from a 4x32-bit counter and a 4x32-bit key.
// The counter and key words are packed into 128-bit registers (ctr0 / key0 in
// the lowest lane) and passed through aesni1xm128i. Each of the four resulting
// 32-bit words is scaled by TWOPOW32_INV_FLOAT and offset by half of that
// constant; word i is written to the (i+1)-th output reference.
QUALIFIERS void aesni_float4(uint32 ctr0, uint32 ctr1, uint32 ctr2, uint32 ctr3,
                             uint32 key0, uint32 key1, uint32 key2, uint32 key3,
                             float & rnd1, float & rnd2, float & rnd3, float & rnd4)
{
    const __m128i counter = _mm_set_epi32(ctr3, ctr2, ctr1, ctr0);
    const __m128i key = _mm_set_epi32(key3, key2, key1, key0);
    const __m128i block = aesni1xm128i(counter, key);

    // Unaligned store: the local array carries no 16-byte alignment guarantee.
    uint32 words[4];
    _mm_storeu_si128(reinterpret_cast<__m128i *>(words), block);

    float * const outputs[4] = {&rnd1, &rnd2, &rnd3, &rnd4};
    for (int i = 0; i < 4; ++i)
    {
        *outputs[i] = words[i] * TWOPOW32_INV_FLOAT + (TWOPOW32_INV_FLOAT / 2.0f);
    }
}
\ No newline at end of file
This diff is collapsed.
Click to expand it.
pystencils/rng.py
+
37
−
19
View file @
d38aed46
...
@@ -6,7 +6,7 @@ from pystencils.astnodes import LoopOverCoordinate
...
@@ -6,7 +6,7 @@ from pystencils.astnodes import LoopOverCoordinate
from
pystencils.backends.cbackend
import
CustomCodeNode
from
pystencils.backends.cbackend
import
CustomCodeNode
def
_get_
philox
_template
(
data_type
,
num_vars
):
def
_get_
rng
_template
(
name
,
data_type
,
num_vars
):
if
data_type
is
np
.
float32
:
if
data_type
is
np
.
float32
:
c_type
=
"
float
"
c_type
=
"
float
"
elif
data_type
is
np
.
float64
:
elif
data_type
is
np
.
float64
:
...
@@ -14,20 +14,14 @@ def _get_philox_template(data_type, num_vars):
...
@@ -14,20 +14,14 @@ def _get_philox_template(data_type, num_vars):
template
=
"
\n
"
template
=
"
\n
"
for
i
in
range
(
num_vars
):
for
i
in
range
(
num_vars
):
template
+=
"
{{result_symbols[{}].dtype}} {{result_symbols[{}].name}};
\n
"
.
format
(
i
,
i
)
template
+=
"
{{result_symbols[{}].dtype}} {{result_symbols[{}].name}};
\n
"
.
format
(
i
,
i
)
template
+=
(
"
philox
_{}{}({{parameters}},
"
+
"
,
"
.
join
([
"
{{result_symbols[{}].name}}
"
]
*
num_vars
)
+
"
);
\n
"
)
\
template
+=
(
"
{}
_{}{}({{parameters}},
"
+
"
,
"
.
join
([
"
{{result_symbols[{}].name}}
"
]
*
num_vars
)
+
"
);
\n
"
)
\
.
format
(
c_type
,
num_vars
,
*
tuple
(
range
(
num_vars
)))
.
format
(
name
,
c_type
,
num_vars
,
*
tuple
(
range
(
num_vars
)))
return
template
return
template
def
_get_
philox
_code
(
template
,
dialect
,
vector_instruction_set
,
time_step
,
offsets
,
keys
,
dim
,
result_symbols
):
def
_get_
rng
_code
(
template
,
dialect
,
vector_instruction_set
,
time_step
,
offsets
,
keys
,
dim
,
result_symbols
):
parameters
=
[
time_step
]
+
[
LoopOverCoordinate
.
get_loop_counter_symbol
(
i
)
+
offsets
[
i
]
parameters
=
[
time_step
]
+
[
LoopOverCoordinate
.
get_loop_counter_symbol
(
i
)
+
offsets
[
i
]
for
i
in
range
(
dim
)]
+
list
(
keys
)
for
i
in
range
(
dim
)]
+
[
0
]
*
(
3
-
dim
)
+
list
(
keys
)
while
len
(
parameters
)
<
6
:
parameters
.
append
(
0
)
parameters
=
parameters
[:
6
]
assert
len
(
parameters
)
==
6
if
dialect
==
'
cuda
'
or
(
dialect
==
'
c
'
and
vector_instruction_set
is
None
):
if
dialect
==
'
cuda
'
or
(
dialect
==
'
c
'
and
vector_instruction_set
is
None
):
return
template
.
format
(
parameters
=
'
,
'
.
join
(
str
(
p
)
for
p
in
parameters
),
return
template
.
format
(
parameters
=
'
,
'
.
join
(
str
(
p
)
for
p
in
parameters
),
...
@@ -36,15 +30,21 @@ def _get_philox_code(template, dialect, vector_instruction_set, time_step, offse
...
@@ -36,15 +30,21 @@ def _get_philox_code(template, dialect, vector_instruction_set, time_step, offse
raise
NotImplementedError
(
"
Not yet implemented for this backend
"
)
raise
NotImplementedError
(
"
Not yet implemented for this backend
"
)
class
Philox
Base
(
CustomCodeNode
):
class
RNG
Base
(
CustomCodeNode
):
def
__init__
(
self
,
dim
,
time_step
=
TypedSymbol
(
"
time_step
"
,
np
.
uint32
),
offsets
=
(
0
,
0
,
0
),
keys
=
(
0
,
0
)):
def
__init__
(
self
,
dim
,
time_step
=
TypedSymbol
(
"
time_step
"
,
np
.
uint32
),
offsets
=
(
0
,
0
,
0
),
keys
=
None
):
if
keys
is
None
:
keys
=
(
0
,)
*
self
.
_num_keys
if
len
(
keys
)
!=
self
.
_num_keys
:
raise
ValueError
(
"
Provided {} keys but need {}
"
.
format
(
len
(
keys
),
self
.
_num_keys
))
if
len
(
offsets
)
!=
3
:
raise
ValueError
(
"
Provided {} offsets but need {}
"
.
format
(
len
(
offsets
),
3
))
self
.
result_symbols
=
tuple
(
TypedSymbol
(
sp
.
Dummy
().
name
,
self
.
_data_type
)
for
_
in
range
(
self
.
_num_vars
))
self
.
result_symbols
=
tuple
(
TypedSymbol
(
sp
.
Dummy
().
name
,
self
.
_data_type
)
for
_
in
range
(
self
.
_num_vars
))
symbols_read
=
[
s
for
s
in
keys
if
isinstance
(
s
,
sp
.
Symbol
)]
symbols_read
=
[
s
for
s
in
keys
if
isinstance
(
s
,
sp
.
Symbol
)]
super
().
__init__
(
""
,
symbols_read
=
symbols_read
,
symbols_defined
=
self
.
result_symbols
)
super
().
__init__
(
""
,
symbols_read
=
symbols_read
,
symbols_defined
=
self
.
result_symbols
)
self
.
_time_step
=
time_step
self
.
_time_step
=
time_step
self
.
_offsets
=
offsets
self
.
_offsets
=
offsets
self
.
headers
=
[
'"
philox
_rand.h
"'
]
self
.
headers
=
[
'"
{}
_rand.h
"'
.
format
(
self
.
_name
)
]
self
.
keys
=
tuple
(
keys
)
self
.
keys
=
tuple
(
keys
)
self
.
_args
=
sp
.
sympify
((
dim
,
time_step
,
keys
))
self
.
_args
=
sp
.
sympify
((
dim
,
time_step
,
keys
))
self
.
_dim
=
dim
self
.
_dim
=
dim
...
@@ -65,22 +65,40 @@ class PhiloxBase(CustomCodeNode):
...
@@ -65,22 +65,40 @@ class PhiloxBase(CustomCodeNode):
return
self
# nothing to replace inside this node - would destroy intermediate "dummy" by re-creating them
return
self
# nothing to replace inside this node - would destroy intermediate "dummy" by re-creating them
def
get_code
(
self
,
dialect
,
vector_instruction_set
):
def
get_code
(
self
,
dialect
,
vector_instruction_set
):
template
=
_get_
philox
_template
(
self
.
_data_type
,
self
.
_num_vars
)
template
=
_get_
rng
_template
(
self
.
_name
,
self
.
_data_type
,
self
.
_num_vars
)
return
_get_
philox
_code
(
template
,
dialect
,
vector_instruction_set
,
return
_get_
rng
_code
(
template
,
dialect
,
vector_instruction_set
,
self
.
_time_step
,
self
.
_offsets
,
self
.
keys
,
self
.
_dim
,
self
.
result_symbols
)
self
.
_time_step
,
self
.
_offsets
,
self
.
keys
,
self
.
_dim
,
self
.
result_symbols
)
def
__repr__
(
self
):
def
__repr__
(
self
):
return
(
"
,
"
.
join
([
'
{}
'
]
*
self
.
_num_vars
)
+
"
<- PhiloxRNG
"
).
format
(
*
self
.
result_symbols
)
return
(
"
,
"
.
join
([
'
{}
'
]
*
self
.
_num_vars
)
+
"
<- {}RNG
"
).
format
(
*
self
.
result_symbols
,
self
.
_name
.
capitalize
())
class
PhiloxTwoDoubles
(
RNGBase
):
_name
=
"
philox
"
_data_type
=
np
.
float64
_num_vars
=
2
_num_keys
=
2
class
PhiloxFourFloats
(
RNGBase
):
_name
=
"
philox
"
_data_type
=
np
.
float32
_num_vars
=
4
_num_keys
=
2
class
PhiloxTwoDoubles
(
PhiloxBase
):
class
AESNITwoDoubles
(
RNGBase
):
_name
=
"
aesni
"
_data_type
=
np
.
float64
_data_type
=
np
.
float64
_num_vars
=
2
_num_vars
=
2
_num_keys
=
4
class
PhiloxFourFloats
(
PhiloxBase
):
class
AESNIFourFloats
(
RNGBase
):
_name
=
"
aesni
"
_data_type
=
np
.
float32
_data_type
=
np
.
float32
_num_vars
=
4
_num_vars
=
4
_num_keys
=
4
def
random_symbol
(
assignment_list
,
seed
=
TypedSymbol
(
"
seed
"
,
np
.
uint32
),
rng_node
=
PhiloxTwoDoubles
,
*
args
,
**
kwargs
):
def
random_symbol
(
assignment_list
,
seed
=
TypedSymbol
(
"
seed
"
,
np
.
uint32
),
rng_node
=
PhiloxTwoDoubles
,
*
args
,
**
kwargs
):
...
...
This diff is collapsed.
Click to expand it.
pystencils_tests/test_random.py
+
47
−
1
View file @
d38aed46
import
numpy
as
np
import
numpy
as
np
import
pystencils
as
ps
import
pystencils
as
ps
from
pystencils.rng
import
PhiloxFourFloats
,
PhiloxTwoDoubles
from
pystencils.rng
import
PhiloxFourFloats
,
PhiloxTwoDoubles
,
AESNIFourFloats
,
AESNITwoDoubles
# curand_Philox4x32_10(make_uint4(124, i, j, 0), make_uint2(0, 0))
# curand_Philox4x32_10(make_uint4(124, i, j, 0), make_uint2(0, 0))
...
@@ -56,3 +56,49 @@ def test_philox_float():
...
@@ -56,3 +56,49 @@ def test_philox_float():
float_reference
=
philox_reference
*
2.
**-
32
+
2.
**-
33
float_reference
=
philox_reference
*
2.
**-
32
+
2.
**-
33
assert
(
np
.
allclose
(
arr
,
float_reference
,
rtol
=
0
,
atol
=
np
.
finfo
(
np
.
float32
).
eps
))
assert
(
np
.
allclose
(
arr
,
float_reference
,
rtol
=
0
,
atol
=
np
.
finfo
(
np
.
float32
).
eps
))
def test_aesni_double():
    """Run the AESNITwoDoubles node on a 2x2 CPU domain and check the value range."""
    dh = ps.create_data_handling((2, 2), default_ghost_layers=0, default_target="cpu")
    field = dh.add_array("f", values_per_cell=2)
    # 42.0 lies outside [0, 1], so any cell the kernel does not overwrite fails the check below.
    dh.fill('f', 42.0)

    rng = AESNITwoDoubles(dh.dim)
    stores = [ps.Assignment(field(i), rng.result_symbols[i]) for i in range(2)]
    kernel = ps.create_kernel([rng] + stores, target=dh.default_target).compile()

    dh.all_to_gpu()
    dh.run_kernel(kernel, time_step=124)
    dh.all_to_cpu()

    arr = dh.gather_array('f')
    in_unit_interval = np.logical_and(arr <= 1.0, arr >= 0)
    assert in_unit_interval.all()

    # TODO: compare against reference values once an `aesni_reference` array is available:
    #   x = aesni_reference[:, :, 0::2]; y = aesni_reference[:, :, 1::2]
    #   z = x ^ y << (53 - 32)
    #   double_reference = z * 2.**-53 + 2.**-54
    #   np.allclose(arr, double_reference, rtol=0, atol=np.finfo(np.float64).eps)
def test_aesni_float():
    """Run the AESNIFourFloats node on a 2x2 CPU domain and check the value range."""
    dh = ps.create_data_handling((2, 2), default_ghost_layers=0, default_target="cpu")
    f = dh.add_array("f", values_per_cell=4)
    # 42.0 lies outside [0, 1], so any cell the kernel does not overwrite fails the check below.
    dh.fill('f', 42.0)

    aesni_node = AESNIFourFloats(dh.dim)
    assignments = [aesni_node] + [ps.Assignment(f(i), aesni_node.result_symbols[i]) for i in range(4)]
    kernel = ps.create_kernel(assignments, target=dh.default_target).compile()

    dh.all_to_gpu()
    dh.run_kernel(kernel, time_step=124)
    dh.all_to_cpu()

    arr = dh.gather_array('f')
    assert np.logical_and(arr <= 1.0, arr >= 0).all()

    # TODO: compare against reference values once an `aesni_reference` array is available:
    #   float_reference = aesni_reference * 2.**-32 + 2.**-33
    #   np.allclose(arr, float_reference, rtol=0, atol=np.finfo(np.float32).eps)
\ No newline at end of file
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment