diff --git a/python/waLBerla/tools/config/distributeProblemBetweenFixedDomain.py b/python/waLBerla/tools/config/distributeProblemBetweenFixedDomain.py
index 1e504841f60ed30daaa46c4842f88f177a396c44..b50947122423621cfda7f82fec738aa2e081adc9 100644
--- a/python/waLBerla/tools/config/distributeProblemBetweenFixedDomain.py
+++ b/python/waLBerla/tools/config/distributeProblemBetweenFixedDomain.py
@@ -3,8 +3,11 @@ from math import floor, ceil, log2, prod
 from typing import Tuple, List
 
 import numpy as np
+
 # FIXME: Probably discard this method as it is not helpful to find problem sizes unequal the initial one.
-def partition_domain_approximate_N(N, Dx, Dy, Dz, verbose: bool = False) -> Tuple[int, int, int]:
+def partition_domain_approximate_N(
+    N, Dx, Dy, Dz, verbose: bool = False
+) -> Tuple[int, int, int]:
     """
     Partitions N data points among a 3D domain split into Dx, Dy, Dz subdomains.
 
@@ -28,14 +31,16 @@ def partition_domain_approximate_N(N, Dx, Dy, Dz, verbose: bool = False) -> Tupl
     N_sub = N / D_total  # This may not be an integer
 
     # Compute approximate cube root distribution
-    cube_root = (N_sub) ** (1/3)
+    cube_root = (N_sub) ** (1 / 3)
     n = np.array([max(1, round(cube_root * (d / D_sum))) for d in D])
 
     # Adjust for rounding errors to ensure total points remain close to N
     total_points = np.prod(n * D)
 
     if verbose:
-        print(f"ESTIMATE: I want to have {N} points, but I get {total_points} points. So a difference of {N-total_points} points.")
+        print(
+            f"ESTIMATE: I want to have {N} points, but I get {total_points} points. So a difference of {N-total_points} points."
+        )
 
     while total_points > N:
         # Reduce the largest dimension slightly to fit within N
@@ -44,15 +49,32 @@ def partition_domain_approximate_N(N, Dx, Dy, Dz, verbose: bool = False) -> Tupl
         total_points = np.prod(n * D)
 
     if verbose:
-        print(f"REDUCE: I want to have {N} points, but I get {total_points} points. So a difference of {N-total_points} points.")
+        print(
+            f"REDUCE: I want to have {N} points, but I get {total_points} points. So a difference of {N-total_points} points."
+        )
 
     stay = True
     while stay and total_points < N:
         # Increase a dimension to reach N
         sorted_indices = np.argsort(n)
-        condition0 = D_total * n[sorted_indices[1]] * n[sorted_indices[2]] * (n[sorted_indices[0]] + 1)
-        condition1 = D_total * n[sorted_indices[0]] * n[sorted_indices[2]] * (n[sorted_indices[1]] + 1)
-        condition2 = D_total * n[sorted_indices[0]] * n[sorted_indices[1]] * (n[sorted_indices[2]] + 1)
+        condition0 = (
+            D_total
+            * n[sorted_indices[1]]
+            * n[sorted_indices[2]]
+            * (n[sorted_indices[0]] + 1)
+        )
+        condition1 = (
+            D_total
+            * n[sorted_indices[0]]
+            * n[sorted_indices[2]]
+            * (n[sorted_indices[1]] + 1)
+        )
+        condition2 = (
+            D_total
+            * n[sorted_indices[0]]
+            * n[sorted_indices[1]]
+            * (n[sorted_indices[2]] + 1)
+        )
         if condition0 <= N:
             n[sorted_indices[0]] += 1
         elif condition1 <= N:
@@ -64,17 +86,23 @@ def partition_domain_approximate_N(N, Dx, Dy, Dz, verbose: bool = False) -> Tupl
         total_points = np.prod(n * D)
 
     if verbose:
-        print(f"FILL-UP: I want to have {N} points, but I get {total_points} points. So a difference of {N-total_points} points.")
+        print(
+            f"FILL-UP: I want to have {N} points, but I get {total_points} points. So a difference of {N-total_points} points."
+        )
 
-    if N-total_points != 0:
-        print(f"The domain could not ideally be decomposed.\nThere are {N-total_points} less then requested.")
+    if N - total_points != 0:
+        print(
+            f"The domain could not ideally be decomposed.\nThere are {N-total_points} less than requested."
+        )
 
-    n = tuple(sorted(n, reverse=True)) #descending order
+    n = tuple(sorted(n, reverse=True))  # descending order
     return int(n[0]), int(n[1]), int(n[2])
 
 
-def suggest_scaling_appropriate_sizes_around_N(N: int, search_range: int = 2) -> Tuple[int]:
+def suggest_scaling_appropriate_sizes_around_N(
+    N: int, search_range: int = 2
+) -> Tuple[int]:
     """
     Provides a set of numbers around target number `N` that are especially suited for scaling runs, in descending order.
     For scaling, it is favorable multiples of powers of two, to bisect your problem as often as possible.
@@ -94,25 +122,26 @@ def suggest_scaling_appropriate_sizes_around_N(N: int, search_range: int = 2) ->
     Returns:
         Tuple[int]: List of candidate problem sizes.
     """
-    first_candidate = 2**max(1, floor(log2(N)) - search_range)
-    candidates = [first_candidate * factor for factor in range(1, 2**(search_range+1))]
-
+    first_candidate = 2 ** max(1, floor(log2(N)) - search_range)
+    candidates = [
+        first_candidate * factor for factor in range(1, 2 ** (search_range + 1))
+    ]
     return tuple(reversed(candidates))
 
 
 def suggest_best_3d_problem_decompositions_for_initial_single_node_performance_test(
-        max_number_of_nodes: int,
-        processes_per_node: int,
-        memory: float,
-        precision: int,
-        size_q: int,
-        safety_factor: float,
-        name: str,
-        prioritize_balanced_decompositions: bool = False,
-        max_number_of_candidates: int = 6,
-        min_distance_between_candidates: float = 2.0,
-        debug: bool = False,
+    max_number_of_nodes: int,
+    processes_per_node: int,
+    memory: float,
+    precision: int,
+    size_q: int,
+    safety_factor: float,
+    name: str,
+    prioritize_balanced_decompositions: bool = False,
+    max_number_of_candidates: int = 6,
+    min_distance_between_candidates: float = 2.0,
+    debug: bool = False,
 ) -> Tuple[np.ndarray, np.ndarray]:
     """
     Suggests a set of 3D problem decompositions that are especially suited for single node performance tests.
@@ -147,7 +176,10 @@ def suggest_best_3d_problem_decompositions_for_initial_single_node_performance_t
 
     # - create all possible combinations of candidates, i.e., the number of combinations is binomial_coefficient(k=3, n=len(n_j_candidates))
     # decomposition_candidates = np.argsort(np.array(combinations_with_replacement(n_j_candidates, 3)), axis=1)
-    list_of_scaling_processes = processes_per_node * 2**np.arange(ceil(log2(max_number_of_nodes+1)))
+    list_of_scaling_processes = processes_per_node * 2 ** np.arange(
+        ceil(log2(max_number_of_nodes + 1))
+    )
+
     def valid_candidate_generator():
         # - check which candidate combinations suffice the memory limit
         # - check if I can bisect my problem until P_max
@@ -165,11 +197,15 @@ def suggest_best_3d_problem_decompositions_for_initial_single_node_performance_t
     # [candidate for _, candidate in zip(range(max_number_of_candidates), valid_candidate_generator())]
     # )
     valid_n_j_candidates = np.array(list(valid_candidate_generator()))
-    reordered_candidates_for_scaling = valid_n_j_candidates[np.argsort(np.ptp(valid_n_j_candidates, axis=1))]
+    reordered_candidates_for_scaling = valid_n_j_candidates[
+        np.argsort(np.ptp(valid_n_j_candidates, axis=1))
+    ]
 
     # valid_candidates_for_scaling = valid_n_j_candidates[np.argsort(np.prod(valid_n_j_candidates, axis=1))[::-1][:max_number_of_candidates]]
 
-    def thin_out_candidates(candidates: np.ndarray, min_distance_between_candidates: float) -> np.ndarray:
+    def thin_out_candidates(
+        candidates: np.ndarray, min_distance_between_candidates: float
+    ) -> np.ndarray:
         """
         Thin out candidates by removing those that are too close to each other based on the min_distance_between_candidates.
         Prioritize keeping more equally distributed candidates.
@@ -177,78 +213,179 @@ def suggest_best_3d_problem_decompositions_for_initial_single_node_performance_t
         # FIXME what I actually want is not to have a relative change from one to the other. I want the memory utilization to be uniformly distributed or rather normally with the peak at the max utilization.
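+        # NOTE: a candidate is kept only if its cell count (np.prod) differs from the cell count
+        # of every already kept candidate by more than a factor of min_distance_between_candidates.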
         thinned_candidates = []
         for candidate in candidates:
-            if all(np.prod(candidate) / np.prod(existing) > min_distance_between_candidates or np.prod(existing) / np.prod(candidate) > min_distance_between_candidates for existing in thinned_candidates):
+            if all(
+                np.prod(candidate) / np.prod(existing) > min_distance_between_candidates
+                or np.prod(existing) / np.prod(candidate)
+                > min_distance_between_candidates
+                for existing in thinned_candidates
+            ):
                 thinned_candidates.append(candidate)
         return np.array(thinned_candidates)
 
     # Thin out valid_candidates_for_scaling
-    reordered_candidates_for_scaling = thin_out_candidates(reordered_candidates_for_scaling, min_distance_between_candidates)
+    reordered_candidates_for_scaling = thin_out_candidates(
+        reordered_candidates_for_scaling, min_distance_between_candidates
+    )
 
     # Reorder the thinned candidates to prioritize balance
     # - discard all but the first `max_number_of_candidates` combinations which product is closest to N_max
-    valid_candidates_for_scaling = reordered_candidates_for_scaling[np.argsort(np.prod(reordered_candidates_for_scaling, axis=1))[::-1][:max_number_of_candidates]]
-    reordered_candidates_for_scaling = valid_candidates_for_scaling[np.argsort(np.ptp(valid_candidates_for_scaling, axis=1))]
+    valid_candidates_for_scaling = reordered_candidates_for_scaling[
+        np.argsort(np.prod(reordered_candidates_for_scaling, axis=1))[::-1][
+            :max_number_of_candidates
+        ]
+    ]
+    reordered_candidates_for_scaling = valid_candidates_for_scaling[
+        np.argsort(np.ptp(valid_candidates_for_scaling, axis=1))
+    ]
 
     # - sort them to be as equally distributed as possible
     # reordered_candidates_for_scaling = valid_candidates_for_scaling[np.argsort(np.ptp(valid_candidates_for_scaling, axis=1))]
-    reordered_candidates_for_scaling = valid_candidates_for_scaling[np.argsort(np.ptp(valid_candidates_for_scaling, axis=1))]
+    reordered_candidates_for_scaling = valid_candidates_for_scaling[
+        np.argsort(np.ptp(valid_candidates_for_scaling, axis=1))
+    ]
     # reordered_candidates_for_scaling = valid_n_j_candidates[np.argsort(np.ptp(valid_n_j_candidates, axis=1))[:max_number_of_candidates]]
 
     # - denote if P_max can be included or not
-    index_of_scaling_candidates_that_allow_to_include_P_max = np.where(np.prod(valid_candidates_for_scaling, axis=1) % (processes_per_node * max_number_of_nodes) == 0)[0]
-    index_of_scaling_candidates_that_allow_to_include_P_max_reordered = np.where(np.prod(reordered_candidates_for_scaling, axis=1) % (processes_per_node * max_number_of_nodes) == 0)[0]
-
+    index_of_scaling_candidates_that_allow_to_include_P_max = np.where(
+        np.prod(valid_candidates_for_scaling, axis=1)
+        % (processes_per_node * max_number_of_nodes)
+        == 0
+    )[0]
+    index_of_scaling_candidates_that_allow_to_include_P_max_reordered = np.where(
+        np.prod(reordered_candidates_for_scaling, axis=1)
+        % (processes_per_node * max_number_of_nodes)
+        == 0
+    )[0]
 
     # DEBUG block
     if debug:
         relation_factor_n = np.max(valid_candidates_for_scaling[0])
-        print("\n", 50*"-", "\nInformation for best suited candidate on ", name)
+        print("\n", 50 * "-", "\nInformation for best suited candidate on ", name)
         print(f"It is possible to allocate a total of N={N_max:.3e} cells.")
         print(f"It is possible to allocate a total of n={n_max:.3e} cells per process.")
 
         if not prioritize_balanced_decompositions:
-            utilization = (processes_per_node * np.prod(valid_candidates_for_scaling, axis=1)) / N_max
-            unbalance_factor = np.maximum(0.01, np.ptp(valid_candidates_for_scaling, axis=1) / relation_factor_n)
+            utilization = (
+                processes_per_node * np.prod(valid_candidates_for_scaling, axis=1)
+            ) / N_max
+            unbalance_factor = np.maximum(
+                0.01, np.ptp(valid_candidates_for_scaling, axis=1) / relation_factor_n
+            )
             # size_order_score = utilization / unbalance_factor
-            print("Utilization [without reordering] is ", utilization, " %." )
-            print("Unbalance factor [without reordering] is ", unbalance_factor, " %." )
+            # print("Utilization [without reordering] is ", utilization, " %.")
+            # print("Unbalance factor [without reordering] is ", unbalance_factor, " %.")
            # print("Total score [without reordering] is ", size_order_score, "." )
-            print("All available nodes can be included for candidates of index [without reordering]: ", index_of_scaling_candidates_that_allow_to_include_P_max)
-            print(*[f"{i:2}: {tuple(elem)},\t# N={processes_per_node * np.prod(elem):.3e} ~ {util:5.1%} | {balance:5.1%}" for i, (util, balance, elem) in enumerate(zip(utilization, unbalance_factor, valid_candidates_for_scaling))], sep='\n')
+            # print(
+            #     "All available nodes can be included for candidates of index [without reordering]: ",
+            #     index_of_scaling_candidates_that_allow_to_include_P_max,
+            # )
+            print(
+                *[
+                    f"""{i:2}: {tuple(elem)},\t"""
+                    f"""# N={processes_per_node * np.prod(elem):.3e} ~ {util:5.1%} | """
+                    f"""{balance:5.1%}"""
+                    f"""\t{'# ' + str(max_number_of_nodes) + ' nodes included' if i in index_of_scaling_candidates_that_allow_to_include_P_max else ''}"""
+                    for i, (util, balance, elem) in enumerate(
+                        zip(utilization, unbalance_factor, valid_candidates_for_scaling)
+                    )
+                ],
+                sep="\n",
+            )
         elif prioritize_balanced_decompositions:
-            utilization_reordered = (processes_per_node * np.prod(reordered_candidates_for_scaling, axis=1)) / N_max
-            unbalance_factor_reordered = np.maximum(0.01, np.ptp(reordered_candidates_for_scaling, axis=1) / relation_factor_n)
+            utilization_reordered = (
+                processes_per_node * np.prod(reordered_candidates_for_scaling, axis=1)
+            ) / N_max
+            unbalance_factor_reordered = np.maximum(
+                0.01,
+                np.ptp(reordered_candidates_for_scaling, axis=1) / relation_factor_n,
+            )
             # balance_order_score = utilization_reordered / unbalance_factor_reordered
-            print("Utilization [with reordering] is ", utilization_reordered, " %." )
-            print("Unbalance factor [with reordering] is ", unbalance_factor_reordered, " %.")
+            # print(
+            #     "Utilization [with reordering] is ",
+            #     utilization_reordered,
+            #     " %.",
+            # )
+            # print(
+            #     "Unbalance factor [with reordering] is ",
+            #     unbalance_factor_reordered,
+            #     " %.",
+            # )
            # print("Total score [with reordering] is ", balance_order_score, "." )
-            print("All available nodes can be included for candidates of index [with reordering]: ", index_of_scaling_candidates_that_allow_to_include_P_max_reordered)
-            print(*[f"{i:2}: {tuple(elem)},\t# N={processes_per_node * np.prod(elem):.3e} ~ {util:5.1%} | {balance:5.1%}" for i, (util, balance, elem) in enumerate(zip(utilization_reordered, unbalance_factor_reordered, reordered_candidates_for_scaling))], sep='\n')
-            print("Use 3-5 of these candidates for single node performance tests. Then use the best one for scaling.")
-
-    # TODO create functions out of the sub-steps
+            # print(
+            #     "All available nodes can be included for candidates of index [with reordering]: ",
+            #     index_of_scaling_candidates_that_allow_to_include_P_max_reordered,
+            # )
+            print(
+                *[
+                    f"""{i:2}: {tuple(elem)},\t"""
+                    f"""# N={processes_per_node * np.prod(elem):.3e} ~ {util:5.1%} | """
+                    f"""{balance:5.1%}"""
+                    f"""\t{'# ' + str(max_number_of_nodes) + ' nodes included' if i in index_of_scaling_candidates_that_allow_to_include_P_max_reordered else ''}"""
+                    for i, (util, balance, elem) in enumerate(
+                        zip(
+                            utilization_reordered,
+                            unbalance_factor_reordered,
+                            reordered_candidates_for_scaling,
+                        )
+                    )
+                ],
+                sep="\n",
+            )
+            print(
+                "Use 3-5 of these candidates for single node performance tests. Then use the best one for scaling."
+            )
+    # 100 nodes included
+    # TODO create functions out of the sub-steps
     if prioritize_balanced_decompositions:
-        return reordered_candidates_for_scaling, index_of_scaling_candidates_that_allow_to_include_P_max_reordered
+        return (
+            reordered_candidates_for_scaling,
+            index_of_scaling_candidates_that_allow_to_include_P_max_reordered,
+        )
     else:
-        return valid_candidates_for_scaling, index_of_scaling_candidates_that_allow_to_include_P_max
-
+        return (
+            valid_candidates_for_scaling,
+            index_of_scaling_candidates_that_allow_to_include_P_max,
+        )
 
 
 if __name__ == "__main__":
     # Example usage
-    for max_number_of_nodes, processes_per_node, memory, precision, size_q, safety_factor, name in [
+    for (
+        max_number_of_nodes,
+        processes_per_node,
+        memory,
+        precision,
+        size_q,
+        safety_factor,
+        name,
+    ) in [
         # ( 100, 112, 2.56e11, 8, 27, 1.4, "MareNostrum5-GPP"),
         # ( 100, 1, 6.40e10, 8, 19, 1.2, "MareNostrum5-ACC"),  # NOTE: you have 4 times the GPUs but each one has it's own memory
+        # ( 100, 1, 6.40e10, 8, 27, 1.2, "MareNostrum5-ACC"),  # NOTE: you have 4 times the GPUs but each one has its own memory
+        # ( 32, 48, 3.20e10, 8, 27, 1.4, "MareNostrum5-ACC"),  # NOTE: you have 4 times the GPUs but each one has its own memory
        # ( 512, 128, 2.56e11, 8, 27, 1.4, "Q27-LUMI-C"),
        # ( 512, 128, 2.56e11, 8, 19, 1.4, "Q19-LUMI-C"),
        # ( 512, 128, 2.56e11, 8, 15, 1.4, "Q15-LUMI-C"),
        # ( 128, 128, 5.12e11, 8, 27, 1.4, "LUMI-C-big"),
-        (1024, 2, 1.28e11, 8, 19, 1.2, "LUMI-G"),  # NOTE: you have 8 times the GPUs but each one has it's own memory
+        # (1024, 2, 1.28e11, 8, 19, 1.2, "LUMI-G"),  # NOTE: you have 8 times the GPUs but each one has its own memory
        # (1024, 2, 1.28e11, 8, 26, 1.2, "MTW-LUMI-G"),  # NOTE: you have 8 times the GPUs but each one has it's own memory
+        (128, 128, 1.00e09, 8, 19, 1.2, "Deucalion-ARM"),  # FIXME add the right parameters for Deucalion
     ]:
-        _, _ = suggest_best_3d_problem_decompositions_for_initial_single_node_performance_test(
-            max_number_of_nodes, processes_per_node, memory, precision, size_q, safety_factor, name, max_number_of_candidates=20, min_distance_between_candidates=1.3, debug=True
+        _, _ = (
+            suggest_best_3d_problem_decompositions_for_initial_single_node_performance_test(
+                max_number_of_nodes,
+                processes_per_node,
+                memory,
+                precision,
+                size_q,
+                safety_factor,
+                name,
+                max_number_of_candidates=20,
+                min_distance_between_candidates=1.2,
+                prioritize_balanced_decompositions=False,
+                debug=True,
+            )
         )
 
 # Fluid Q19
-# Temperature Q7
\ No newline at end of file
+# Temperature Q7
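+# (presumably size_q = 19 + 7 = 26 for such a coupled setup, cf. the commented-out "MTW-LUMI-G" entry above)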