8175813: PPC64: "mbind: Invalid argument" when -XX:+UseNUMA is used

Thu, 01 Jun 2017 20:42:49 -0400

author
gromero
date
Thu, 01 Jun 2017 20:42:49 -0400
changeset 8776
4a575a49e938
parent 8775
3c3a934f88c2
child 8777
09d0d56ca735

8175813: PPC64: "mbind: Invalid argument" when -XX:+UseNUMA is used
Reviewed-by: dholmes, zgu

src/os/linux/vm/os_linux.cpp file | annotate | diff | comparison | revisions
src/os/linux/vm/os_linux.hpp file | annotate | diff | comparison | revisions
     1.1 --- a/src/os/linux/vm/os_linux.cpp	Tue May 30 08:17:41 2017 +0000
     1.2 +++ b/src/os/linux/vm/os_linux.cpp	Thu Jun 01 20:42:49 2017 -0400
     1.3 @@ -2736,8 +2736,9 @@
     1.4  bool os::numa_topology_changed()   { return false; }
     1.5  
     1.6  size_t os::numa_get_groups_num() {
     1.7 -  int max_node = Linux::numa_max_node();
     1.8 -  return max_node > 0 ? max_node + 1 : 1;
     1.9 +  // Return just the number of nodes in which it's possible to allocate memory
    1.10 +  // (in numa terminology, configured nodes).
    1.11 +  return Linux::numa_num_configured_nodes();
    1.12  }
    1.13  
    1.14  int os::numa_get_group_id() {
    1.15 @@ -2751,11 +2752,33 @@
    1.16    return 0;
    1.17  }
    1.18  
    1.19 +int os::Linux::get_existing_num_nodes() {
    1.20 +  size_t node;
    1.21 +  size_t highest_node_number = Linux::numa_max_node();
    1.22 +  int num_nodes = 0;
    1.23 +
    1.24 +  // Get the total number of nodes in the system including nodes without memory.
    1.25 +  for (node = 0; node <= highest_node_number; node++) {
    1.26 +    if (isnode_in_existing_nodes(node)) {
    1.27 +      num_nodes++;
    1.28 +    }
    1.29 +  }
    1.30 +  return num_nodes;
    1.31 +}
    1.32 +
    1.33  size_t os::numa_get_leaf_groups(int *ids, size_t size) {
    1.34 -  for (size_t i = 0; i < size; i++) {
    1.35 -    ids[i] = i;
    1.36 -  }
    1.37 -  return size;
    1.38 +  size_t highest_node_number = Linux::numa_max_node();
    1.39 +  size_t i = 0;
    1.40 +
    1.41 +  // Map all node ids in which it is possible to allocate memory. Also nodes are
    1.42 +  // not always consecutively available, i.e. available from 0 to the highest
    1.43 +  // node number.
    1.44 +  for (size_t node = 0; node <= highest_node_number; node++) {
    1.45 +    if (Linux::isnode_in_configured_nodes(node)) {
    1.46 +      ids[i++] = node;
    1.47 +    }
    1.48 +  }
    1.49 +  return i;
    1.50  }
    1.51  
    1.52  bool os::get_page_info(char *start, page_info* info) {
    1.53 @@ -2825,18 +2848,28 @@
    1.54                                             libnuma_dlsym(handle, "numa_node_to_cpus")));
    1.55        set_numa_max_node(CAST_TO_FN_PTR(numa_max_node_func_t,
    1.56                                         libnuma_dlsym(handle, "numa_max_node")));
    1.57 +      set_numa_num_configured_nodes(CAST_TO_FN_PTR(numa_num_configured_nodes_func_t,
    1.58 +                                                   libnuma_dlsym(handle, "numa_num_configured_nodes")));
    1.59        set_numa_available(CAST_TO_FN_PTR(numa_available_func_t,
    1.60                                          libnuma_dlsym(handle, "numa_available")));
    1.61        set_numa_tonode_memory(CAST_TO_FN_PTR(numa_tonode_memory_func_t,
    1.62                                              libnuma_dlsym(handle, "numa_tonode_memory")));
    1.63        set_numa_interleave_memory(CAST_TO_FN_PTR(numa_interleave_memory_func_t,
    1.64 -                                            libnuma_dlsym(handle, "numa_interleave_memory")));
    1.65 +                                                libnuma_dlsym(handle, "numa_interleave_memory")));
    1.66        set_numa_set_bind_policy(CAST_TO_FN_PTR(numa_set_bind_policy_func_t,
    1.67 -                                            libnuma_dlsym(handle, "numa_set_bind_policy")));
    1.68 -
    1.69 +                                              libnuma_dlsym(handle, "numa_set_bind_policy")));
    1.70 +      set_numa_bitmask_isbitset(CAST_TO_FN_PTR(numa_bitmask_isbitset_func_t,
    1.71 +                                               libnuma_dlsym(handle, "numa_bitmask_isbitset")));
    1.72 +      set_numa_distance(CAST_TO_FN_PTR(numa_distance_func_t,
    1.73 +                                       libnuma_dlsym(handle, "numa_distance")));
    1.74  
    1.75        if (numa_available() != -1) {
    1.76          set_numa_all_nodes((unsigned long*)libnuma_dlsym(handle, "numa_all_nodes"));
    1.77 +        set_numa_all_nodes_ptr((struct bitmask **)libnuma_dlsym(handle, "numa_all_nodes_ptr"));
    1.78 +        set_numa_nodes_ptr((struct bitmask **)libnuma_dlsym(handle, "numa_nodes_ptr"));
    1.79 +        // Create an index -> node mapping, since nodes are not always consecutive
    1.80 +        _nindex_to_node = new (ResourceObj::C_HEAP, mtInternal) GrowableArray<int>(0, true);
    1.81 +        rebuild_nindex_to_node_map();
    1.82          // Create a cpu -> node mapping
    1.83          _cpu_to_node = new (ResourceObj::C_HEAP, mtInternal) GrowableArray<int>(0, true);
    1.84          rebuild_cpu_to_node_map();
    1.85 @@ -2847,6 +2880,17 @@
    1.86    return false;
    1.87  }
    1.88  
    1.89 +void os::Linux::rebuild_nindex_to_node_map() {
    1.90 +  int highest_node_number = Linux::numa_max_node();
    1.91 +
    1.92 +  nindex_to_node()->clear();
    1.93 +  for (int node = 0; node <= highest_node_number; node++) {
    1.94 +    if (Linux::isnode_in_existing_nodes(node)) {
    1.95 +      nindex_to_node()->append(node);
    1.96 +    }
    1.97 +  }
    1.98 +}
    1.99 +
   1.100  // rebuild_cpu_to_node_map() constructs a table mapping cpud id to node id.
   1.101  // The table is later used in get_node_by_cpu().
   1.102  void os::Linux::rebuild_cpu_to_node_map() {
   1.103 @@ -2866,16 +2910,46 @@
   1.104  
   1.105    cpu_to_node()->clear();
   1.106    cpu_to_node()->at_grow(cpu_num - 1);
   1.107 -  size_t node_num = numa_get_groups_num();
   1.108 -
   1.109 +
   1.110 +  size_t node_num = get_existing_num_nodes();
   1.111 +
   1.112 +  int distance = 0;
   1.113 +  int closest_distance = INT_MAX;
   1.114 +  int closest_node = 0;
   1.115    unsigned long *cpu_map = NEW_C_HEAP_ARRAY(unsigned long, cpu_map_size, mtInternal);
   1.116    for (size_t i = 0; i < node_num; i++) {
   1.117 -    if (numa_node_to_cpus(i, cpu_map, cpu_map_size * sizeof(unsigned long)) != -1) {
   1.118 +    // Check if node is configured (not a memory-less node). If it is not, find
   1.119 +    // the closest configured node.
   1.120 +    if (!isnode_in_configured_nodes(nindex_to_node()->at(i))) {
   1.121 +      closest_distance = INT_MAX;
   1.122 +      // Check distance from all remaining nodes in the system. Ignore distance
   1.123 +      // from itself and from another non-configured node.
   1.124 +      for (size_t m = 0; m < node_num; m++) {
   1.125 +        if (m != i && isnode_in_configured_nodes(nindex_to_node()->at(m))) {
   1.126 +          distance = numa_distance(nindex_to_node()->at(i), nindex_to_node()->at(m));
   1.127 +          // If a closer node is found, update. There is always at least one
   1.128 +          // configured node in the system so there is always at least one node
   1.129 +          // close.
   1.130 +          if (distance != 0 && distance < closest_distance) {
   1.131 +            closest_distance = distance;
   1.132 +            closest_node = nindex_to_node()->at(m);
   1.133 +          }
   1.134 +        }
   1.135 +      }
   1.136 +     } else {
   1.137 +       // Current node is already a configured node.
   1.138 +       closest_node = nindex_to_node()->at(i);
   1.139 +     }
   1.140 +
   1.141 +    // Get cpus from the original node and map them to the closest node. If node
   1.142 +    // is a configured node (not a memory-less node), then original node and
   1.143 +    // closest node are the same.
   1.144 +    if (numa_node_to_cpus(nindex_to_node()->at(i), cpu_map, cpu_map_size * sizeof(unsigned long)) != -1) {
   1.145        for (size_t j = 0; j < cpu_map_valid_size; j++) {
   1.146          if (cpu_map[j] != 0) {
   1.147            for (size_t k = 0; k < BitsPerCLong; k++) {
   1.148              if (cpu_map[j] & (1UL << k)) {
   1.149 -              cpu_to_node()->at_put(j * BitsPerCLong + k, i);
   1.150 +              cpu_to_node()->at_put(j * BitsPerCLong + k, closest_node);
   1.151              }
   1.152            }
   1.153          }
   1.154 @@ -2893,14 +2967,20 @@
   1.155  }
   1.156  
   1.157  GrowableArray<int>* os::Linux::_cpu_to_node;
   1.158 +GrowableArray<int>* os::Linux::_nindex_to_node;
   1.159  os::Linux::sched_getcpu_func_t os::Linux::_sched_getcpu;
   1.160  os::Linux::numa_node_to_cpus_func_t os::Linux::_numa_node_to_cpus;
   1.161  os::Linux::numa_max_node_func_t os::Linux::_numa_max_node;
   1.162 +os::Linux::numa_num_configured_nodes_func_t os::Linux::_numa_num_configured_nodes;
   1.163  os::Linux::numa_available_func_t os::Linux::_numa_available;
   1.164  os::Linux::numa_tonode_memory_func_t os::Linux::_numa_tonode_memory;
   1.165  os::Linux::numa_interleave_memory_func_t os::Linux::_numa_interleave_memory;
   1.166  os::Linux::numa_set_bind_policy_func_t os::Linux::_numa_set_bind_policy;
   1.167 +os::Linux::numa_bitmask_isbitset_func_t os::Linux::_numa_bitmask_isbitset;
   1.168 +os::Linux::numa_distance_func_t os::Linux::_numa_distance;
   1.169  unsigned long* os::Linux::_numa_all_nodes;
   1.170 +struct bitmask* os::Linux::_numa_all_nodes_ptr;
   1.171 +struct bitmask* os::Linux::_numa_nodes_ptr;
   1.172  
   1.173  bool os::pd_uncommit_memory(char* addr, size_t size) {
   1.174    uintptr_t res = (uintptr_t) ::mmap(addr, size, PROT_NONE,
     2.1 --- a/src/os/linux/vm/os_linux.hpp	Tue May 30 08:17:41 2017 +0000
     2.2 +++ b/src/os/linux/vm/os_linux.hpp	Thu Jun 01 20:42:49 2017 -0400
     2.3 @@ -67,6 +67,7 @@
     2.4    static bool _supports_fast_thread_cpu_time;
     2.5  
     2.6    static GrowableArray<int>* _cpu_to_node;
     2.7 +  static GrowableArray<int>* _nindex_to_node;
     2.8  
     2.9   protected:
    2.10  
    2.11 @@ -94,7 +95,9 @@
    2.12    static void set_is_floating_stack()         { _is_floating_stack = true; }
    2.13  
    2.14    static void rebuild_cpu_to_node_map();
    2.15 +  static void rebuild_nindex_to_node_map();
    2.16    static GrowableArray<int>* cpu_to_node()    { return _cpu_to_node; }
    2.17 +  static GrowableArray<int>* nindex_to_node()  { return _nindex_to_node; }
    2.18  
    2.19    static size_t find_large_page_size();
    2.20    static size_t setup_large_page_size();
    2.21 @@ -243,28 +246,41 @@
    2.22    typedef int (*sched_getcpu_func_t)(void);
    2.23    typedef int (*numa_node_to_cpus_func_t)(int node, unsigned long *buffer, int bufferlen);
    2.24    typedef int (*numa_max_node_func_t)(void);
    2.25 +  typedef int (*numa_num_configured_nodes_func_t)(void);
    2.26    typedef int (*numa_available_func_t)(void);
    2.27    typedef int (*numa_tonode_memory_func_t)(void *start, size_t size, int node);
    2.28    typedef void (*numa_interleave_memory_func_t)(void *start, size_t size, unsigned long *nodemask);
    2.29    typedef void (*numa_set_bind_policy_func_t)(int policy);
    2.30 +  typedef int (*numa_bitmask_isbitset_func_t)(struct bitmask *bmp, unsigned int n);
    2.31 +  typedef int (*numa_distance_func_t)(int node1, int node2);
    2.32  
    2.33    static sched_getcpu_func_t _sched_getcpu;
    2.34    static numa_node_to_cpus_func_t _numa_node_to_cpus;
    2.35    static numa_max_node_func_t _numa_max_node;
    2.36 +  static numa_num_configured_nodes_func_t _numa_num_configured_nodes;
    2.37    static numa_available_func_t _numa_available;
    2.38    static numa_tonode_memory_func_t _numa_tonode_memory;
    2.39    static numa_interleave_memory_func_t _numa_interleave_memory;
    2.40    static numa_set_bind_policy_func_t _numa_set_bind_policy;
    2.41 +  static numa_bitmask_isbitset_func_t _numa_bitmask_isbitset;
    2.42 +  static numa_distance_func_t _numa_distance;
    2.43    static unsigned long* _numa_all_nodes;
    2.44 +  static struct bitmask* _numa_all_nodes_ptr;
    2.45 +  static struct bitmask* _numa_nodes_ptr;
    2.46  
    2.47    static void set_sched_getcpu(sched_getcpu_func_t func) { _sched_getcpu = func; }
    2.48    static void set_numa_node_to_cpus(numa_node_to_cpus_func_t func) { _numa_node_to_cpus = func; }
    2.49    static void set_numa_max_node(numa_max_node_func_t func) { _numa_max_node = func; }
    2.50 +  static void set_numa_num_configured_nodes(numa_num_configured_nodes_func_t func) { _numa_num_configured_nodes = func; }
    2.51    static void set_numa_available(numa_available_func_t func) { _numa_available = func; }
    2.52    static void set_numa_tonode_memory(numa_tonode_memory_func_t func) { _numa_tonode_memory = func; }
    2.53    static void set_numa_interleave_memory(numa_interleave_memory_func_t func) { _numa_interleave_memory = func; }
    2.54    static void set_numa_set_bind_policy(numa_set_bind_policy_func_t func) { _numa_set_bind_policy = func; }
    2.55 +  static void set_numa_bitmask_isbitset(numa_bitmask_isbitset_func_t func) { _numa_bitmask_isbitset = func; }
    2.56 +  static void set_numa_distance(numa_distance_func_t func) { _numa_distance = func; }
    2.57    static void set_numa_all_nodes(unsigned long* ptr) { _numa_all_nodes = ptr; }
    2.58 +  static void set_numa_all_nodes_ptr(struct bitmask **ptr) { _numa_all_nodes_ptr = *ptr; }
    2.59 +  static void set_numa_nodes_ptr(struct bitmask **ptr) { _numa_nodes_ptr = *ptr; }
    2.60    static int sched_getcpu_syscall(void);
    2.61  public:
    2.62    static int sched_getcpu()  { return _sched_getcpu != NULL ? _sched_getcpu() : -1; }
    2.63 @@ -272,6 +288,9 @@
    2.64      return _numa_node_to_cpus != NULL ? _numa_node_to_cpus(node, buffer, bufferlen) : -1;
    2.65    }
    2.66    static int numa_max_node() { return _numa_max_node != NULL ? _numa_max_node() : -1; }
    2.67 +  static int numa_num_configured_nodes() {
    2.68 +    return _numa_num_configured_nodes != NULL ? _numa_num_configured_nodes() : -1;
    2.69 +  }
    2.70    static int numa_available() { return _numa_available != NULL ? _numa_available() : -1; }
    2.71    static int numa_tonode_memory(void *start, size_t size, int node) {
    2.72      return _numa_tonode_memory != NULL ? _numa_tonode_memory(start, size, node) : -1;
    2.73 @@ -286,7 +305,25 @@
    2.74        _numa_set_bind_policy(policy);
    2.75      }
    2.76    }
    2.77 +  static int numa_distance(int node1, int node2) {
    2.78 +    return _numa_distance != NULL ? _numa_distance(node1, node2) : -1;
    2.79 +  }
    2.80    static int get_node_by_cpu(int cpu_id);
    2.81 +  static int get_existing_num_nodes();
    2.82 +  // Check if numa node is configured (non-zero memory node).
    2.83 +  static bool isnode_in_configured_nodes(unsigned int n) {
    2.84 +    if (_numa_bitmask_isbitset != NULL && _numa_all_nodes_ptr != NULL) {
    2.85 +      return _numa_bitmask_isbitset(_numa_all_nodes_ptr, n);
    2.86 +    } else
    2.87 +      return 0;
    2.88 +  }
    2.89 +  // Check if numa node exists in the system (including zero memory nodes).
    2.90 +  static bool isnode_in_existing_nodes(unsigned int n) {
    2.91 +    if (_numa_bitmask_isbitset != NULL && _numa_nodes_ptr != NULL) {
    2.92 +      return _numa_bitmask_isbitset(_numa_nodes_ptr, n);
    2.93 +    } else
    2.94 +      return 0;
    2.95 +  }
    2.96  };
    2.97  
    2.98  

mercurial