# Source code for shape_function_vectorized

"""Vectorized version of compute_phi_M_standard and related functions.

This module provides vectorized implementations that operate on all Gauss points
and nodes simultaneously, eliminating the need for explicit loops.
"""

from common import np


def compute_z_and_H_2d_vectorized(
    x_G, x_nodes, a, H_scaling_factor, eps, dtype=np.float64
):
    """Compute z values and H matrices for the 2D case (vectorized).

    Args:
        x_G: Array of Gauss points, shape (n_gauss, 2).
        x_nodes: Array of node points, shape (n_nodes, 2).
        a: Array of scaling factors (support sizes), shape (n_nodes,).
        H_scaling_factor: Scaling factor dividing the linear terms of H.
        eps: Small value to avoid division by zero for coincident points.
        dtype: Data type for the H arrays (default: np.float64).

    Returns:
        Tuple (z, z_P_x, z_P_y, H_T, HT_P_x, HT_P_y, H, H_P_x, H_P_y):
        z and its derivatives have shape (n_gauss, n_nodes); the H arrays
        have shape (n_gauss, n_nodes, 3).
    """
    # Broadcast Gauss points against nodes:
    # (n_gauss, 1, 2) - (1, n_nodes, 2) -> diff of shape (n_gauss, n_nodes, 2)
    diff = x_G[:, np.newaxis, :] - x_nodes[np.newaxis, :, :]

    # Euclidean distances and normalized distances z = |x_G - x_node| / a.
    distances = np.sqrt(np.sum(diff**2, axis=2))
    z = distances / a[np.newaxis, :]

    # dz/dx = diff_x / (a * |diff|); clamp the distance with eps so a Gauss
    # point sitting exactly on a node does not divide by zero.
    denom = a[np.newaxis, :] * np.maximum(distances, eps)
    del distances  # free memory before allocating the H arrays
    z_P_x = diff[:, :, 0] / denom
    z_P_y = diff[:, :, 1] / denom

    # Linear basis H_T = [1, dx/s, dy/s] per (gauss, node) pair.
    H_T = np.zeros((x_G.shape[0], x_nodes.shape[0], 3), dtype=dtype)
    H_T[:, :, 0] = dtype(1.0)
    H_T[:, :, 1] = diff[:, :, 0] / H_scaling_factor
    H_T[:, :, 2] = diff[:, :, 1] / H_scaling_factor
    del diff  # free memory

    # Spatial derivatives of H_T: only the matching linear term survives.
    HT_P_x = np.zeros_like(H_T)
    HT_P_x[:, :, 1] = dtype(1.0) / H_scaling_factor
    HT_P_y = np.zeros_like(H_T)
    HT_P_y[:, :, 2] = dtype(1.0) / H_scaling_factor

    # Fix: the original np.transpose(..., (0, 1, 2)) calls were identity
    # permutations, not transposes.  Downstream code contracts H and H_T with
    # explicit einsum/outer-product indices and never mutates them, so the
    # "transposed" views can simply alias the same arrays.
    H = H_T
    H_P_x = HT_P_x
    H_P_y = HT_P_y

    return z, z_P_x, z_P_y, H_T, HT_P_x, HT_P_y, H, H_P_x, H_P_y
def compute_z_and_H_3d_vectorized(
    x_G, x_nodes, a, H_scaling_factor, eps, dtype=np.float64
):
    """Compute z values and H matrices for the 3D case (vectorized).

    Args:
        x_G: Array of Gauss points, shape (n_gauss, 3).
        x_nodes: Array of node points, shape (n_nodes, 3).
        a: Array of scaling factors (support sizes), shape (n_nodes,).
        H_scaling_factor: Scaling factor dividing the linear terms of H.
        eps: Small value to avoid division by zero for coincident points.
        dtype: Data type for the H arrays (default: np.float64).

    Returns:
        Tuple (z, z_P_x, z_P_y, z_P_z, H_T, HT_P_x, HT_P_y, HT_P_z,
        H, H_P_x, H_P_y, H_P_z): z and its derivatives have shape
        (n_gauss, n_nodes); the H arrays have shape (n_gauss, n_nodes, 4).
    """
    # Broadcast Gauss points against nodes:
    # (n_gauss, 1, 3) - (1, n_nodes, 3) -> diff of shape (n_gauss, n_nodes, 3)
    diff = x_G[:, np.newaxis, :] - x_nodes[np.newaxis, :, :]

    # Euclidean distances and normalized distances z = |x_G - x_node| / a.
    distances = np.sqrt(np.sum(diff**2, axis=2))
    z = distances / a[np.newaxis, :]

    # dz/dx = diff_x / (a * |diff|); clamp with eps to avoid division by zero.
    denom = a[np.newaxis, :] * np.maximum(distances, eps)
    del distances  # free memory before allocating the H arrays
    z_P_x = diff[:, :, 0] / denom
    z_P_y = diff[:, :, 1] / denom
    z_P_z = diff[:, :, 2] / denom

    # Linear basis H_T = [1, dx/s, dy/s, dz/s] per (gauss, node) pair.
    H_T = np.zeros((x_G.shape[0], x_nodes.shape[0], 4), dtype=dtype)
    H_T[:, :, 0] = dtype(1.0)
    H_T[:, :, 1] = diff[:, :, 0] / H_scaling_factor
    H_T[:, :, 2] = diff[:, :, 1] / H_scaling_factor
    H_T[:, :, 3] = diff[:, :, 2] / H_scaling_factor
    del diff  # free memory

    # Spatial derivatives of H_T: only the matching linear term survives.
    HT_P_x = np.zeros_like(H_T)
    HT_P_x[:, :, 1] = dtype(1.0) / H_scaling_factor
    HT_P_y = np.zeros_like(H_T)
    HT_P_y[:, :, 2] = dtype(1.0) / H_scaling_factor
    HT_P_z = np.zeros_like(H_T)
    HT_P_z[:, :, 3] = dtype(1.0) / H_scaling_factor

    # Fix: the original np.transpose(..., (0, 1, 2)) calls were identity
    # permutations, not transposes.  Downstream code contracts H and H_T with
    # explicit einsum/outer-product indices and never mutates them, so the
    # "transposed" views can simply alias the same arrays.
    H = H_T
    H_P_x = HT_P_x
    H_P_y = HT_P_y
    H_P_z = HT_P_z

    return (
        z, z_P_x, z_P_y, z_P_z,
        H_T, HT_P_x, HT_P_y, HT_P_z,
        H, H_P_x, H_P_y, H_P_z,
    )
def compute_phi_kernel(z):
    """Evaluate the cubic-spline kernel and its derivative at normalized distances.

    Piecewise definition:
        0   <= z < 0.5 : phi = 2/3 - 4*z^2 + 4*z^3
        0.5 <= z <= 1  : phi = 4/3 - 4*z + 4*z^2 - 4/3*z^3
        otherwise      : phi = 0 (this includes z < 0)

    Args:
        z: Array of normalized distances.

    Returns:
        Tuple (phi, phi_P_z): kernel values and derivatives d(phi)/dz,
        each with the same shape as z.
    """
    phi = np.zeros_like(z)
    phi_P_z = np.zeros_like(z)

    # Split the support [0, 1] into the two spline pieces.
    inner = (z >= 0) & (z < 0.5)
    outer = (z >= 0.5) & (z <= 1.0)

    # Inner piece: 0 <= z < 0.5.
    zi = z[inner]
    phi[inner] = 2.0 / 3 - 4 * zi**2 + 4 * zi**3
    phi_P_z[inner] = -8.0 * zi + 12.0 * zi**2

    # Outer piece: 0.5 <= z <= 1.
    zo = z[outer]
    phi[outer] = 4.0 / 3 - 4 * zo + 4 * zo**2 - 4.0 / 3 * zo**3
    phi_P_z[outer] = -4 + 8 * zo - 4 * zo**2

    return phi, phi_P_z
def compute_phi_M_standard_vectorized(
    x_G,
    Gauss_grain_id,
    x_nodes,
    nodes_grain_id,
    a,
    M,
    M_P_x,
    M_P_y,
    num_interface_segments,
    interface_nodes,
    BxByCxCy,
    M_P_z=None,
    dtype=np.float64,
):
    """Vectorized version of compute_phi_M_standard.

    Computes all Gauss-node kernel interactions at once via broadcasting,
    then loops over Gauss points accumulating einsum outer products into
    the moment matrices M, M_P_x, M_P_y (and M_P_z in 3D).

    Args:
        x_G: Gauss-point coordinates, shape (n_gauss, dim).
        Gauss_grain_id: Unused here; accepted for interface compatibility.
        x_nodes: Node coordinates, shape (n_nodes, dim).
        nodes_grain_id: Unused here; accepted for interface compatibility.
        a: Per-node support sizes, shape (n_nodes,).
        M, M_P_x, M_P_y: Moment matrices, shape (n_gauss, m, m) with m = 3
            (2D) or 4 (3D).  Contributions are ACCUMULATED into the incoming
            values (+=), they are not reset.
        num_interface_segments, interface_nodes, BxByCxCy: Unused here;
            accepted for interface compatibility.
        M_P_z: Optional z-derivative moment matrices; allocated as zeros
            matching M when None.
        dtype: Data type for internal scalars/arrays (default: np.float64).

    Returns:
        Tuple (phi row indices, phi column indices, phi values,
        phi_P_x values, phi_P_y values, phi_P_z values (empty array in 2D),
        M, M_P_x, M_P_y, M_P_z).
    """
    print("begin: compute_phi_M_standard_vectorized", flush=True)

    if M_P_z is None:
        M_P_z = np.zeros_like(M)

    H_scaling_factor = dtype(1.0e-6)
    eps = dtype(2.220446049250313e-16)  # double-precision machine epsilon

    # Determine dimension from M shape: a 4x4 moment matrix means 3D.
    is_3d = np.shape(M)[1] == 4

    # Compute z and H matrices for all (gauss, node) pairs.
    if is_3d:
        (
            z,
            z_P_x,
            z_P_y,
            z_P_z,
            H_T,
            HT_P_x,
            HT_P_y,
            HT_P_z,
            H,
            H_P_x,
            H_P_y,
            H_P_z,
        ) = compute_z_and_H_3d_vectorized(x_G, x_nodes, a, H_scaling_factor, eps, dtype)
    else:
        (z, z_P_x, z_P_y, H_T, HT_P_x, HT_P_y, H, H_P_x, H_P_y) = (
            compute_z_and_H_2d_vectorized(x_G, x_nodes, a, H_scaling_factor, eps, dtype)
        )
        z_P_z = None
        H_P_z = None
        HT_P_z = None

    # Kernel values and d(phi)/dz for all distances.
    phi, phi_P_z = compute_phi_kernel(z)

    # Pairs inside the kernel support (0 <= z <= 1).
    valid_mask = (z >= 0) & (z <= 1.0)
    valid_indices = np.where(valid_mask)
    gauss_indices = valid_indices[0]
    node_indices = valid_indices[1]

    # Nonzero entries for later sparse-matrix construction.
    phi_nonzero_index_row = gauss_indices
    phi_nonzero_index_column = node_indices
    phi_nonzerovalue_data = phi[valid_mask]

    # Chain rule: d(phi)/dx = d(phi)/dz * dz/dx, computed on the full grids.
    phi_P_x = phi_P_z * z_P_x
    phi_P_y = phi_P_z * z_P_y
    phi_P_x_nonzerovalue_data = phi_P_x[valid_mask]
    phi_P_y_nonzerovalue_data = phi_P_y[valid_mask]
    if is_3d:
        phi_P_z_val = phi_P_z * z_P_z
        phi_P_z_nonzerovalue_data = phi_P_z_val[valid_mask]
    else:
        phi_P_z_nonzerovalue_data = np.array([])

    # Accumulate moment-matrix contributions, one Gauss point at a time.
    n_gauss = x_G.shape[0]
    print("Before: Find nodes that contribute to a Gauss pt", flush=True)
    for i in range(n_gauss):
        if i % 10000 == 0:
            print(f"i: {i}")  # progress indicator for long runs

        # Nodes whose support covers this Gauss point.
        node_mask = valid_mask[i, :]
        if not np.any(node_mask):
            continue

        # Slice the contributing values for this Gauss point.
        phi_i = phi[i, node_mask]
        phi_P_x_i = phi_P_x[i, node_mask]
        phi_P_y_i = phi_P_y[i, node_mask]
        H_i = H[i, node_mask, :]  # (n_valid_nodes, m)
        H_T_i = H_T[i, node_mask, :]  # (n_valid_nodes, m)
        H_P_x_i = H_P_x[i, node_mask, :]
        H_P_y_i = H_P_y[i, node_mask, :]
        HT_P_x_i = HT_P_x[i, node_mask, :]
        HT_P_y_i = HT_P_y[i, node_mask, :]

        # M[i] += sum over valid nodes of phi * (H outer H_T).
        M[i] += np.einsum("ni,nj,n->ij", H_i, H_T_i, phi_i)

        # Product rule: d(phi*H*H_T)/dx has three terms.
        M_P_x[i] += (
            np.einsum("ni,nj,n->ij", H_i, H_T_i, phi_P_x_i)
            + np.einsum("ni,nj,n->ij", H_P_x_i, H_T_i, phi_i)
            + np.einsum("ni,nj,n->ij", H_i, HT_P_x_i, phi_i)
        )

        # Same three-term product rule for the y derivative.
        M_P_y[i] += (
            np.einsum("ni,nj,n->ij", H_i, H_T_i, phi_P_y_i)
            + np.einsum("ni,nj,n->ij", H_P_y_i, H_T_i, phi_i)
            + np.einsum("ni,nj,n->ij", H_i, HT_P_y_i, phi_i)
        )

        if is_3d:
            # And for the z derivative in 3D.
            phi_P_z_i = phi_P_z_val[i, node_mask]
            H_P_z_i = H_P_z[i, node_mask, :]
            HT_P_z_i = HT_P_z[i, node_mask, :]
            M_P_z[i] += (
                np.einsum("ni,nj,n->ij", H_i, H_T_i, phi_P_z_i)
                + np.einsum("ni,nj,n->ij", H_P_z_i, H_T_i, phi_i)
                + np.einsum("ni,nj,n->ij", H_i, HT_P_z_i, phi_i)
            )

    print(" end: compute_phi_M_standard_vectorized", flush=True)
    return (
        phi_nonzero_index_row,
        phi_nonzero_index_column,
        phi_nonzerovalue_data,
        phi_P_x_nonzerovalue_data,
        phi_P_y_nonzerovalue_data,
        phi_P_z_nonzerovalue_data,
        M,
        M_P_x,
        M_P_y,
        M_P_z,
    )
def compute_phi_M_standard_sparse(
    x_G,
    Gauss_grain_id,
    x_nodes,
    nodes_grain_id,
    a,
    M,
    M_P_x,
    M_P_y,
    num_interface_segments,
    interface_nodes,
    BxByCxCy,
    M_P_z=None,
    dtype=np.float64,
):
    """Optimized sparse version of compute_phi_M_standard using vectorized operations.

    Eliminates the per-Gauss-point loop by extracting all valid (gauss, node)
    interactions with advanced indexing and accumulating outer products with
    np.add.at.

    Args:
        x_G: Gauss-point coordinates, shape (n_gauss, dim).
        Gauss_grain_id: Unused here; accepted for interface compatibility.
        x_nodes: Node coordinates, shape (n_nodes, dim).
        nodes_grain_id: Unused here; accepted for interface compatibility.
        a: Per-node support sizes, shape (n_nodes,).
        M, M_P_x, M_P_y: Moment matrices, shape (n_gauss, m, m) with m = 3
            (2D) or 4 (3D).  NOTE: these are RESET to zero before
            accumulation here, unlike compute_phi_M_standard_vectorized,
            which adds onto the incoming values.
        num_interface_segments, interface_nodes, BxByCxCy: Unused here;
            accepted for interface compatibility.
        M_P_z: Optional z-derivative moment matrices; allocated as zeros
            matching M when None.
        dtype: Data type for internal scalars/arrays (default: np.float64).

    Returns:
        Tuple (phi row indices, phi column indices, phi values,
        phi_P_x values, phi_P_y values, phi_P_z values (empty array in 2D),
        M, M_P_x, M_P_y, M_P_z).
    """
    print("begin: compute_phi_M_standard_sparse", flush=True)

    if M_P_z is None:
        M_P_z = np.zeros_like(M)

    H_scaling_factor = dtype(1.0e-6)
    eps = dtype(2.220446049250313e-16)  # double-precision machine epsilon

    # Determine dimension from M shape: a 4x4 moment matrix means 3D.
    is_3d = np.shape(M)[1] == 4

    # Compute z and H matrices for all (gauss, node) pairs.
    if is_3d:
        (
            z,
            z_P_x,
            z_P_y,
            z_P_z,
            H_T,
            HT_P_x,
            HT_P_y,
            HT_P_z,
            H,
            H_P_x,
            H_P_y,
            H_P_z,
        ) = compute_z_and_H_3d_vectorized(x_G, x_nodes, a, H_scaling_factor, eps, dtype)
    else:
        (z, z_P_x, z_P_y, H_T, HT_P_x, HT_P_y, H, H_P_x, H_P_y) = (
            compute_z_and_H_2d_vectorized(x_G, x_nodes, a, H_scaling_factor, eps, dtype)
        )
        z_P_z = None
        H_P_z = None
        HT_P_z = None

    # Kernel values and d(phi)/dz for all distances.
    phi, phi_P_z = compute_phi_kernel(z)

    # Pairs inside the kernel support (0 <= z <= 1).
    valid_mask = (z >= 0) & (z <= 1.0)
    gauss_indices, node_indices = np.where(valid_mask)

    # Nonzero entries for later sparse-matrix construction.
    phi_nonzero_index_row = gauss_indices
    phi_nonzero_index_column = node_indices
    phi_nonzerovalue_data = phi[gauss_indices, node_indices]

    # Chain rule applied to the valid pairs only (1-D gathered arrays).
    phi_P_x_vals = (
        phi_P_z[gauss_indices, node_indices] * z_P_x[gauss_indices, node_indices]
    )
    phi_P_y_vals = (
        phi_P_z[gauss_indices, node_indices] * z_P_y[gauss_indices, node_indices]
    )
    phi_P_x_nonzerovalue_data = phi_P_x_vals
    phi_P_y_nonzerovalue_data = phi_P_y_vals
    if is_3d:
        phi_P_z_vals = (
            phi_P_z[gauss_indices, node_indices] * z_P_z[gauss_indices, node_indices]
        )
        phi_P_z_nonzerovalue_data = phi_P_z_vals
    else:
        phi_P_z_nonzerovalue_data = np.array([])
        phi_P_z_vals = None

    n_gauss = x_G.shape[0]
    print(
        f"Sparse computation: {len(gauss_indices)} non-zero interactions out of {n_gauss * x_nodes.shape[0]} total",
        flush=True,
    )

    # Gather the H rows for all valid pairs: shape (n_interactions, m).
    H_vals = H[gauss_indices, node_indices, :]
    H_T_vals = H_T[gauss_indices, node_indices, :]
    H_P_x_vals = H_P_x[gauss_indices, node_indices, :]
    H_P_y_vals = H_P_y[gauss_indices, node_indices, :]
    HT_P_x_vals = HT_P_x[gauss_indices, node_indices, :]
    HT_P_y_vals = HT_P_y[gauss_indices, node_indices, :]

    phi_vals = phi_nonzerovalue_data

    # Reset the output matrices before accumulation (see docstring NOTE).
    M[:] = 0
    M_P_x[:] = 0
    M_P_y[:] = 0
    if is_3d:
        M_P_z[:] = 0

    # All outer products at once: shape (n_interactions, m, m).
    print("Computing outer products for M matrices", flush=True)

    # M: sum over interactions of phi * (H outer H_T), scattered per Gauss
    # point with np.add.at (unbuffered, so repeated indices accumulate).
    outer_HH = H_vals[:, :, np.newaxis] * H_T_vals[:, np.newaxis, :]
    weighted_HH = outer_HH * phi_vals[:, np.newaxis, np.newaxis]
    np.add.at(M, gauss_indices, weighted_HH)

    # M_P_x: three-term product rule.
    # Term 1: (H outer H_T) * phi_P_x
    weighted_HH_phi_P_x = outer_HH * phi_P_x_vals[:, np.newaxis, np.newaxis]
    # Term 2: (H_P_x outer H_T) * phi
    outer_HPxH = H_P_x_vals[:, :, np.newaxis] * H_T_vals[:, np.newaxis, :]
    weighted_HPxH = outer_HPxH * phi_vals[:, np.newaxis, np.newaxis]
    # Term 3: (H outer HT_P_x) * phi
    outer_HHPx = H_vals[:, :, np.newaxis] * HT_P_x_vals[:, np.newaxis, :]
    weighted_HHPx = outer_HHPx * phi_vals[:, np.newaxis, np.newaxis]
    np.add.at(M_P_x, gauss_indices, weighted_HH_phi_P_x + weighted_HPxH + weighted_HHPx)

    # M_P_y: same structure for the y derivative.
    weighted_HH_phi_P_y = outer_HH * phi_P_y_vals[:, np.newaxis, np.newaxis]
    outer_HPyH = H_P_y_vals[:, :, np.newaxis] * H_T_vals[:, np.newaxis, :]
    weighted_HPyH = outer_HPyH * phi_vals[:, np.newaxis, np.newaxis]
    outer_HHPy = H_vals[:, :, np.newaxis] * HT_P_y_vals[:, np.newaxis, :]
    weighted_HHPy = outer_HHPy * phi_vals[:, np.newaxis, np.newaxis]
    np.add.at(M_P_y, gauss_indices, weighted_HH_phi_P_y + weighted_HPyH + weighted_HHPy)

    if is_3d:
        # M_P_z: same structure for the z derivative.
        H_P_z_vals = H_P_z[gauss_indices, node_indices, :]
        HT_P_z_vals = HT_P_z[gauss_indices, node_indices, :]
        weighted_HH_phi_P_z = outer_HH * phi_P_z_vals[:, np.newaxis, np.newaxis]
        outer_HPzH = H_P_z_vals[:, :, np.newaxis] * H_T_vals[:, np.newaxis, :]
        weighted_HPzH = outer_HPzH * phi_vals[:, np.newaxis, np.newaxis]
        outer_HHPz = H_vals[:, :, np.newaxis] * HT_P_z_vals[:, np.newaxis, :]
        weighted_HHPz = outer_HHPz * phi_vals[:, np.newaxis, np.newaxis]
        np.add.at(
            M_P_z, gauss_indices, weighted_HH_phi_P_z + weighted_HPzH + weighted_HHPz
        )

    print(" end: compute_phi_M_standard_sparse", flush=True)
    return (
        phi_nonzero_index_row,
        phi_nonzero_index_column,
        phi_nonzerovalue_data,
        phi_P_x_nonzerovalue_data,
        phi_P_y_nonzerovalue_data,
        phi_P_z_nonzerovalue_data,
        M,
        M_P_x,
        M_P_y,
        M_P_z,
    )