Tue, 02 Nov 2010 09:00:37 -0700
6987135: Performance regression on Intel platform with 32-bits edition between 6u13 and 6u14.
Summary: Use hardware DIV instruction for long division by constant when it is faster than code with multiply.
Reviewed-by: never
twisti@1020 | 1 | /* |
kvn@1977 | 2 | * Copyright (c) 1997, 2010, Oracle and/or its affiliates. All Rights Reserved. |
twisti@1020 | 3 | * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. |
twisti@1020 | 4 | * |
twisti@1020 | 5 | * This code is free software; you can redistribute it and/or modify it |
twisti@1020 | 6 | * under the terms of the GNU General Public License version 2 only, as |
twisti@1020 | 7 | * published by the Free Software Foundation. |
twisti@1020 | 8 | * |
twisti@1020 | 9 | * This code is distributed in the hope that it will be useful, but WITHOUT |
twisti@1020 | 10 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or |
twisti@1020 | 11 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License |
twisti@1020 | 12 | * version 2 for more details (a copy is included in the LICENSE file that |
twisti@1020 | 13 | * accompanied this code). |
twisti@1020 | 14 | * |
twisti@1020 | 15 | * You should have received a copy of the GNU General Public License version |
twisti@1020 | 16 | * 2 along with this work; if not, write to the Free Software Foundation, |
twisti@1020 | 17 | * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. |
twisti@1020 | 18 | * |
trims@1907 | 19 | * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA |
trims@1907 | 20 | * or visit www.oracle.com if you need additional information or have any |
trims@1907 | 21 | * questions. |
twisti@1020 | 22 | * |
twisti@1020 | 23 | */ |
twisti@1020 | 24 | |
twisti@1020 | 25 | class VM_Version : public Abstract_VM_Version { |
twisti@1020 | 26 | public: |
twisti@1020 | 27 | // cpuid result register layouts. These are all unions of a uint32_t |
twisti@1020 | 28 | // (in case anyone wants access to the register as a whole) and a bitfield. |
twisti@1020 | 29 | |
twisti@1020 | 30 | union StdCpuid1Eax { |
twisti@1020 | 31 | uint32_t value; |
twisti@1020 | 32 | struct { |
twisti@1020 | 33 | uint32_t stepping : 4, |
twisti@1020 | 34 | model : 4, |
twisti@1020 | 35 | family : 4, |
twisti@1020 | 36 | proc_type : 2, |
twisti@1020 | 37 | : 2, |
twisti@1020 | 38 | ext_model : 4, |
twisti@1020 | 39 | ext_family : 8, |
twisti@1020 | 40 | : 4; |
twisti@1020 | 41 | } bits; |
twisti@1020 | 42 | }; |
twisti@1020 | 43 | |
twisti@1020 | 44 | union StdCpuid1Ebx { // example, unused |
twisti@1020 | 45 | uint32_t value; |
twisti@1020 | 46 | struct { |
twisti@1020 | 47 | uint32_t brand_id : 8, |
twisti@1020 | 48 | clflush_size : 8, |
twisti@1020 | 49 | threads_per_cpu : 8, |
twisti@1020 | 50 | apic_id : 8; |
twisti@1020 | 51 | } bits; |
twisti@1020 | 52 | }; |
twisti@1020 | 53 | |
twisti@1020 | 54 | union StdCpuid1Ecx { |
twisti@1020 | 55 | uint32_t value; |
twisti@1020 | 56 | struct { |
twisti@1020 | 57 | uint32_t sse3 : 1, |
twisti@1020 | 58 | : 2, |
twisti@1020 | 59 | monitor : 1, |
twisti@1020 | 60 | : 1, |
twisti@1020 | 61 | vmx : 1, |
twisti@1020 | 62 | : 1, |
twisti@1020 | 63 | est : 1, |
twisti@1020 | 64 | : 1, |
twisti@1020 | 65 | ssse3 : 1, |
twisti@1020 | 66 | cid : 1, |
twisti@1020 | 67 | : 2, |
twisti@1020 | 68 | cmpxchg16: 1, |
twisti@1020 | 69 | : 4, |
twisti@1020 | 70 | dca : 1, |
twisti@1020 | 71 | sse4_1 : 1, |
twisti@1020 | 72 | sse4_2 : 1, |
twisti@1078 | 73 | : 2, |
twisti@1078 | 74 | popcnt : 1, |
twisti@1078 | 75 | : 8; |
twisti@1020 | 76 | } bits; |
twisti@1020 | 77 | }; |
twisti@1020 | 78 | |
twisti@1020 | 79 | union StdCpuid1Edx { |
twisti@1020 | 80 | uint32_t value; |
twisti@1020 | 81 | struct { |
twisti@1020 | 82 | uint32_t : 4, |
twisti@1020 | 83 | tsc : 1, |
twisti@1020 | 84 | : 3, |
twisti@1020 | 85 | cmpxchg8 : 1, |
twisti@1020 | 86 | : 6, |
twisti@1020 | 87 | cmov : 1, |
twisti@1020 | 88 | : 7, |
twisti@1020 | 89 | mmx : 1, |
twisti@1020 | 90 | fxsr : 1, |
twisti@1020 | 91 | sse : 1, |
twisti@1020 | 92 | sse2 : 1, |
twisti@1020 | 93 | : 1, |
twisti@1020 | 94 | ht : 1, |
twisti@1020 | 95 | : 3; |
twisti@1020 | 96 | } bits; |
twisti@1020 | 97 | }; |
twisti@1020 | 98 | |
twisti@1020 | 99 | union DcpCpuid4Eax { |
twisti@1020 | 100 | uint32_t value; |
twisti@1020 | 101 | struct { |
twisti@1020 | 102 | uint32_t cache_type : 5, |
twisti@1020 | 103 | : 21, |
twisti@1020 | 104 | cores_per_cpu : 6; |
twisti@1020 | 105 | } bits; |
twisti@1020 | 106 | }; |
twisti@1020 | 107 | |
twisti@1020 | 108 | union DcpCpuid4Ebx { |
twisti@1020 | 109 | uint32_t value; |
twisti@1020 | 110 | struct { |
twisti@1020 | 111 | uint32_t L1_line_size : 12, |
twisti@1020 | 112 | partitions : 10, |
twisti@1020 | 113 | associativity : 10; |
twisti@1020 | 114 | } bits; |
twisti@1020 | 115 | }; |
twisti@1020 | 116 | |
kvn@1977 | 117 | union TplCpuidBEbx { |
kvn@1977 | 118 | uint32_t value; |
kvn@1977 | 119 | struct { |
kvn@1977 | 120 | uint32_t logical_cpus : 16, |
kvn@1977 | 121 | : 16; |
kvn@1977 | 122 | } bits; |
kvn@1977 | 123 | }; |
kvn@1977 | 124 | |
twisti@1020 | 125 | union ExtCpuid1Ecx { |
twisti@1020 | 126 | uint32_t value; |
twisti@1020 | 127 | struct { |
twisti@1020 | 128 | uint32_t LahfSahf : 1, |
twisti@1020 | 129 | CmpLegacy : 1, |
twisti@1020 | 130 | : 4, |
twisti@1210 | 131 | lzcnt : 1, |
twisti@1020 | 132 | sse4a : 1, |
twisti@1020 | 133 | misalignsse : 1, |
twisti@1020 | 134 | prefetchw : 1, |
twisti@1020 | 135 | : 22; |
twisti@1020 | 136 | } bits; |
twisti@1020 | 137 | }; |
twisti@1020 | 138 | |
twisti@1020 | 139 | union ExtCpuid1Edx { |
twisti@1020 | 140 | uint32_t value; |
twisti@1020 | 141 | struct { |
twisti@1020 | 142 | uint32_t : 22, |
twisti@1020 | 143 | mmx_amd : 1, |
twisti@1020 | 144 | mmx : 1, |
twisti@1020 | 145 | fxsr : 1, |
twisti@1020 | 146 | : 4, |
twisti@1020 | 147 | long_mode : 1, |
twisti@1020 | 148 | tdnow2 : 1, |
twisti@1020 | 149 | tdnow : 1; |
twisti@1020 | 150 | } bits; |
twisti@1020 | 151 | }; |
twisti@1020 | 152 | |
twisti@1020 | 153 | union ExtCpuid5Ex { |
twisti@1020 | 154 | uint32_t value; |
twisti@1020 | 155 | struct { |
twisti@1020 | 156 | uint32_t L1_line_size : 8, |
twisti@1020 | 157 | L1_tag_lines : 8, |
twisti@1020 | 158 | L1_assoc : 8, |
twisti@1020 | 159 | L1_size : 8; |
twisti@1020 | 160 | } bits; |
twisti@1020 | 161 | }; |
twisti@1020 | 162 | |
twisti@1020 | 163 | union ExtCpuid8Ecx { |
twisti@1020 | 164 | uint32_t value; |
twisti@1020 | 165 | struct { |
twisti@1020 | 166 | uint32_t cores_per_cpu : 8, |
twisti@1020 | 167 | : 24; |
twisti@1020 | 168 | } bits; |
twisti@1020 | 169 | }; |
twisti@1020 | 170 | |
twisti@1020 | 171 | protected: |
twisti@1020 | 172 | static int _cpu; |
twisti@1020 | 173 | static int _model; |
twisti@1020 | 174 | static int _stepping; |
twisti@1020 | 175 | static int _cpuFeatures; // features returned by the "cpuid" instruction |
twisti@1020 | 176 | // 0 if this instruction is not available |
twisti@1020 | 177 | static const char* _features_str; |
twisti@1020 | 178 | |
twisti@1020 | 179 | enum { |
twisti@1020 | 180 | CPU_CX8 = (1 << 0), // next bits are from cpuid 1 (EDX) |
twisti@1020 | 181 | CPU_CMOV = (1 << 1), |
twisti@1020 | 182 | CPU_FXSR = (1 << 2), |
twisti@1020 | 183 | CPU_HT = (1 << 3), |
twisti@1020 | 184 | CPU_MMX = (1 << 4), |
twisti@1020 | 185 | CPU_3DNOW = (1 << 5), // 3DNow comes from cpuid 0x80000001 (EDX) |
twisti@1020 | 186 | CPU_SSE = (1 << 6), |
twisti@1020 | 187 | CPU_SSE2 = (1 << 7), |
twisti@1020 | 188 | CPU_SSE3 = (1 << 8), // SSE3 comes from cpuid 1 (ECX) |
twisti@1020 | 189 | CPU_SSSE3 = (1 << 9), |
twisti@1020 | 190 | CPU_SSE4A = (1 << 10), |
twisti@1020 | 191 | CPU_SSE4_1 = (1 << 11), |
twisti@1078 | 192 | CPU_SSE4_2 = (1 << 12), |
twisti@1210 | 193 | CPU_POPCNT = (1 << 13), |
twisti@1210 | 194 | CPU_LZCNT = (1 << 14) |
twisti@1020 | 195 | } cpuFeatureFlags; |
twisti@1020 | 196 | |
twisti@1020 | 197 | // cpuid information block. All info derived from executing cpuid with |
twisti@1020 | 198 | // various function numbers is stored here. Intel and AMD info is |
twisti@1020 | 199 | // merged in this block: accessor methods disentangle it. |
twisti@1020 | 200 | // |
twisti@1020 | 201 | // The info block is laid out in subblocks of 4 dwords corresponding to |
twisti@1020 | 202 | // eax, ebx, ecx and edx, whether or not they contain anything useful. |
twisti@1020 | 203 | struct CpuidInfo { |
twisti@1020 | 204 | // cpuid function 0 |
twisti@1020 | 205 | uint32_t std_max_function; |
twisti@1020 | 206 | uint32_t std_vendor_name_0; |
twisti@1020 | 207 | uint32_t std_vendor_name_1; |
twisti@1020 | 208 | uint32_t std_vendor_name_2; |
twisti@1020 | 209 | |
twisti@1020 | 210 | // cpuid function 1 |
twisti@1020 | 211 | StdCpuid1Eax std_cpuid1_eax; |
twisti@1020 | 212 | StdCpuid1Ebx std_cpuid1_ebx; |
twisti@1020 | 213 | StdCpuid1Ecx std_cpuid1_ecx; |
twisti@1020 | 214 | StdCpuid1Edx std_cpuid1_edx; |
twisti@1020 | 215 | |
twisti@1020 | 216 | // cpuid function 4 (deterministic cache parameters) |
twisti@1020 | 217 | DcpCpuid4Eax dcp_cpuid4_eax; |
twisti@1020 | 218 | DcpCpuid4Ebx dcp_cpuid4_ebx; |
twisti@1020 | 219 | uint32_t dcp_cpuid4_ecx; // unused currently |
twisti@1020 | 220 | uint32_t dcp_cpuid4_edx; // unused currently |
twisti@1020 | 221 | |
kvn@1977 | 222 | // cpuid function 0xB (processor topology) |
kvn@1977 | 223 | // ecx = 0 |
kvn@1977 | 224 | uint32_t tpl_cpuidB0_eax; |
kvn@1977 | 225 | TplCpuidBEbx tpl_cpuidB0_ebx; |
kvn@1977 | 226 | uint32_t tpl_cpuidB0_ecx; // unused currently |
kvn@1977 | 227 | uint32_t tpl_cpuidB0_edx; // unused currently |
kvn@1977 | 228 | |
kvn@1977 | 229 | // ecx = 1 |
kvn@1977 | 230 | uint32_t tpl_cpuidB1_eax; |
kvn@1977 | 231 | TplCpuidBEbx tpl_cpuidB1_ebx; |
kvn@1977 | 232 | uint32_t tpl_cpuidB1_ecx; // unused currently |
kvn@1977 | 233 | uint32_t tpl_cpuidB1_edx; // unused currently |
kvn@1977 | 234 | |
kvn@1977 | 235 | // ecx = 2 |
kvn@1977 | 236 | uint32_t tpl_cpuidB2_eax; |
kvn@1977 | 237 | TplCpuidBEbx tpl_cpuidB2_ebx; |
kvn@1977 | 238 | uint32_t tpl_cpuidB2_ecx; // unused currently |
kvn@1977 | 239 | uint32_t tpl_cpuidB2_edx; // unused currently |
kvn@1977 | 240 | |
twisti@1020 | 241 | // cpuid function 0x80000000 // example, unused |
twisti@1020 | 242 | uint32_t ext_max_function; |
twisti@1020 | 243 | uint32_t ext_vendor_name_0; |
twisti@1020 | 244 | uint32_t ext_vendor_name_1; |
twisti@1020 | 245 | uint32_t ext_vendor_name_2; |
twisti@1020 | 246 | |
twisti@1020 | 247 | // cpuid function 0x80000001 |
twisti@1020 | 248 | uint32_t ext_cpuid1_eax; // reserved |
twisti@1020 | 249 | uint32_t ext_cpuid1_ebx; // reserved |
twisti@1020 | 250 | ExtCpuid1Ecx ext_cpuid1_ecx; |
twisti@1020 | 251 | ExtCpuid1Edx ext_cpuid1_edx; |
twisti@1020 | 252 | |
twisti@1020 | 253 | // cpuid functions 0x80000002 thru 0x80000004: example, unused |
twisti@1020 | 254 | uint32_t proc_name_0, proc_name_1, proc_name_2, proc_name_3; |
twisti@1020 | 255 | uint32_t proc_name_4, proc_name_5, proc_name_6, proc_name_7; |
twisti@1020 | 256 | uint32_t proc_name_8, proc_name_9, proc_name_10,proc_name_11; |
twisti@1020 | 257 | |
twisti@1020 | 258 | // cpuid function 0x80000005 //AMD L1, Intel reserved |
twisti@1020 | 259 | uint32_t ext_cpuid5_eax; // unused currently |
twisti@1020 | 260 | uint32_t ext_cpuid5_ebx; // reserved |
twisti@1020 | 261 | ExtCpuid5Ex ext_cpuid5_ecx; // L1 data cache info (AMD) |
twisti@1020 | 262 | ExtCpuid5Ex ext_cpuid5_edx; // L1 instruction cache info (AMD) |
twisti@1020 | 263 | |
twisti@1020 | 264 | // cpuid function 0x80000008 |
twisti@1020 | 265 | uint32_t ext_cpuid8_eax; // unused currently |
twisti@1020 | 266 | uint32_t ext_cpuid8_ebx; // reserved |
twisti@1020 | 267 | ExtCpuid8Ecx ext_cpuid8_ecx; |
twisti@1020 | 268 | uint32_t ext_cpuid8_edx; // reserved |
twisti@1020 | 269 | }; |
twisti@1020 | 270 | |
twisti@1020 | 271 | // The actual cpuid info block |
twisti@1020 | 272 | static CpuidInfo _cpuid_info; |
twisti@1020 | 273 | |
twisti@1020 | 274 | // Extractors and predicates |
twisti@1020 | 275 | static uint32_t extended_cpu_family() { |
twisti@1020 | 276 | uint32_t result = _cpuid_info.std_cpuid1_eax.bits.family; |
twisti@1020 | 277 | result += _cpuid_info.std_cpuid1_eax.bits.ext_family; |
twisti@1020 | 278 | return result; |
twisti@1020 | 279 | } |
twisti@1020 | 280 | static uint32_t extended_cpu_model() { |
twisti@1020 | 281 | uint32_t result = _cpuid_info.std_cpuid1_eax.bits.model; |
twisti@1020 | 282 | result |= _cpuid_info.std_cpuid1_eax.bits.ext_model << 4; |
twisti@1020 | 283 | return result; |
twisti@1020 | 284 | } |
twisti@1020 | 285 | static uint32_t cpu_stepping() { |
twisti@1020 | 286 | uint32_t result = _cpuid_info.std_cpuid1_eax.bits.stepping; |
twisti@1020 | 287 | return result; |
twisti@1020 | 288 | } |
twisti@1020 | 289 | static uint logical_processor_count() { |
twisti@1020 | 290 | uint result = threads_per_core(); |
twisti@1020 | 291 | return result; |
twisti@1020 | 292 | } |
twisti@1020 | 293 | static uint32_t feature_flags() { |
twisti@1020 | 294 | uint32_t result = 0; |
twisti@1020 | 295 | if (_cpuid_info.std_cpuid1_edx.bits.cmpxchg8 != 0) |
twisti@1020 | 296 | result |= CPU_CX8; |
twisti@1020 | 297 | if (_cpuid_info.std_cpuid1_edx.bits.cmov != 0) |
twisti@1020 | 298 | result |= CPU_CMOV; |
twisti@2144 | 299 | if (_cpuid_info.std_cpuid1_edx.bits.fxsr != 0 || (is_amd() && |
twisti@2144 | 300 | _cpuid_info.ext_cpuid1_edx.bits.fxsr != 0)) |
twisti@1020 | 301 | result |= CPU_FXSR; |
twisti@1020 | 302 | // HT flag is set for multi-core processors also. |
twisti@1020 | 303 | if (threads_per_core() > 1) |
twisti@1020 | 304 | result |= CPU_HT; |
twisti@2144 | 305 | if (_cpuid_info.std_cpuid1_edx.bits.mmx != 0 || (is_amd() && |
twisti@2144 | 306 | _cpuid_info.ext_cpuid1_edx.bits.mmx != 0)) |
twisti@1020 | 307 | result |= CPU_MMX; |
twisti@1020 | 308 | if (_cpuid_info.std_cpuid1_edx.bits.sse != 0) |
twisti@1020 | 309 | result |= CPU_SSE; |
twisti@1020 | 310 | if (_cpuid_info.std_cpuid1_edx.bits.sse2 != 0) |
twisti@1020 | 311 | result |= CPU_SSE2; |
twisti@1020 | 312 | if (_cpuid_info.std_cpuid1_ecx.bits.sse3 != 0) |
twisti@1020 | 313 | result |= CPU_SSE3; |
twisti@1020 | 314 | if (_cpuid_info.std_cpuid1_ecx.bits.ssse3 != 0) |
twisti@1020 | 315 | result |= CPU_SSSE3; |
twisti@1020 | 316 | if (_cpuid_info.std_cpuid1_ecx.bits.sse4_1 != 0) |
twisti@1020 | 317 | result |= CPU_SSE4_1; |
twisti@1020 | 318 | if (_cpuid_info.std_cpuid1_ecx.bits.sse4_2 != 0) |
twisti@1020 | 319 | result |= CPU_SSE4_2; |
twisti@1078 | 320 | if (_cpuid_info.std_cpuid1_ecx.bits.popcnt != 0) |
twisti@1078 | 321 | result |= CPU_POPCNT; |
twisti@1210 | 322 | |
twisti@1210 | 323 | // AMD features. |
twisti@1210 | 324 | if (is_amd()) { |
twisti@1210 | 325 | if (_cpuid_info.ext_cpuid1_edx.bits.tdnow != 0) |
twisti@1210 | 326 | result |= CPU_3DNOW; |
twisti@1210 | 327 | if (_cpuid_info.ext_cpuid1_ecx.bits.lzcnt != 0) |
twisti@1210 | 328 | result |= CPU_LZCNT; |
twisti@1210 | 329 | if (_cpuid_info.ext_cpuid1_ecx.bits.sse4a != 0) |
twisti@1210 | 330 | result |= CPU_SSE4A; |
twisti@1210 | 331 | } |
twisti@1210 | 332 | |
twisti@1020 | 333 | return result; |
twisti@1020 | 334 | } |
twisti@1020 | 335 | |
twisti@1020 | 336 | static void get_processor_features(); |
twisti@1020 | 337 | |
twisti@1020 | 338 | public: |
twisti@1020 | 339 | // Offsets for cpuid asm stub |
twisti@1020 | 340 | static ByteSize std_cpuid0_offset() { return byte_offset_of(CpuidInfo, std_max_function); } |
twisti@1020 | 341 | static ByteSize std_cpuid1_offset() { return byte_offset_of(CpuidInfo, std_cpuid1_eax); } |
twisti@1020 | 342 | static ByteSize dcp_cpuid4_offset() { return byte_offset_of(CpuidInfo, dcp_cpuid4_eax); } |
twisti@1020 | 343 | static ByteSize ext_cpuid1_offset() { return byte_offset_of(CpuidInfo, ext_cpuid1_eax); } |
twisti@1020 | 344 | static ByteSize ext_cpuid5_offset() { return byte_offset_of(CpuidInfo, ext_cpuid5_eax); } |
twisti@1020 | 345 | static ByteSize ext_cpuid8_offset() { return byte_offset_of(CpuidInfo, ext_cpuid8_eax); } |
kvn@1977 | 346 | static ByteSize tpl_cpuidB0_offset() { return byte_offset_of(CpuidInfo, tpl_cpuidB0_eax); } |
kvn@1977 | 347 | static ByteSize tpl_cpuidB1_offset() { return byte_offset_of(CpuidInfo, tpl_cpuidB1_eax); } |
kvn@1977 | 348 | static ByteSize tpl_cpuidB2_offset() { return byte_offset_of(CpuidInfo, tpl_cpuidB2_eax); } |
twisti@1020 | 349 | |
twisti@1020 | 350 | // Initialization |
twisti@1020 | 351 | static void initialize(); |
twisti@1020 | 352 | |
twisti@1020 | 353 | // Asserts |
twisti@1020 | 354 | static void assert_is_initialized() { |
twisti@1020 | 355 | assert(_cpuid_info.std_cpuid1_eax.bits.family != 0, "VM_Version not initialized"); |
twisti@1020 | 356 | } |
twisti@1020 | 357 | |
twisti@1020 | 358 | // |
twisti@1020 | 359 | // Processor family: |
twisti@1020 | 360 | // 3 - 386 |
twisti@1020 | 361 | // 4 - 486 |
twisti@1020 | 362 | // 5 - Pentium |
twisti@1020 | 363 | // 6 - PentiumPro, Pentium II, Celeron, Xeon, Pentium III, Athlon, |
twisti@1020 | 364 | // Pentium M, Core Solo, Core Duo, Core2 Duo |
twisti@1020 | 365 | // family 6 model: 9, 13, 14, 15 |
twisti@1020 | 366 | // 0x0f - Pentium 4, Opteron |
twisti@1020 | 367 | // |
twisti@1020 | 368 | // Note: The cpu family should be used to select between |
twisti@1020 | 369 | // instruction sequences which are valid on all Intel |
twisti@1020 | 370 | // processors. Use the feature test functions below to |
twisti@1020 | 371 | // determine whether a particular instruction is supported. |
twisti@1020 | 372 | // |
twisti@1020 | 373 | static int cpu_family() { return _cpu;} |
twisti@1020 | 374 | static bool is_P6() { return cpu_family() >= 6; } |
twisti@1020 | 375 | |
twisti@1020 | 376 | static bool is_amd() { assert_is_initialized(); return _cpuid_info.std_vendor_name_0 == 0x68747541; } // 'htuA' |
twisti@1020 | 377 | static bool is_intel() { assert_is_initialized(); return _cpuid_info.std_vendor_name_0 == 0x756e6547; } // 'uneG' |
twisti@1020 | 378 | |
kvn@2002 | 379 | static bool supports_processor_topology() { |
kvn@2002 | 380 | return (_cpuid_info.std_max_function >= 0xB) && |
kvn@2002 | 381 | // eax[4:0] | ebx[0:15] == 0 indicates invalid topology level. |
kvn@2002 | 382 | // Some cpus have max cpuid >= 0xB but do not support processor topology. |
kvn@2002 | 383 | ((_cpuid_info.tpl_cpuidB0_eax & 0x1f | _cpuid_info.tpl_cpuidB0_ebx.bits.logical_cpus) != 0); |
kvn@2002 | 384 | } |
kvn@2002 | 385 | |
twisti@1020 | 386 | static uint cores_per_cpu() { |
twisti@1020 | 387 | uint result = 1; |
twisti@1020 | 388 | if (is_intel()) { |
kvn@2002 | 389 | if (supports_processor_topology()) { |
kvn@1977 | 390 | result = _cpuid_info.tpl_cpuidB1_ebx.bits.logical_cpus / |
kvn@1977 | 391 | _cpuid_info.tpl_cpuidB0_ebx.bits.logical_cpus; |
kvn@1977 | 392 | } else { |
kvn@1977 | 393 | result = (_cpuid_info.dcp_cpuid4_eax.bits.cores_per_cpu + 1); |
kvn@1977 | 394 | } |
twisti@1020 | 395 | } else if (is_amd()) { |
twisti@1020 | 396 | result = (_cpuid_info.ext_cpuid8_ecx.bits.cores_per_cpu + 1); |
twisti@1020 | 397 | } |
twisti@1020 | 398 | return result; |
twisti@1020 | 399 | } |
twisti@1020 | 400 | |
twisti@1020 | 401 | static uint threads_per_core() { |
twisti@1020 | 402 | uint result = 1; |
kvn@2002 | 403 | if (is_intel() && supports_processor_topology()) { |
kvn@1977 | 404 | result = _cpuid_info.tpl_cpuidB0_ebx.bits.logical_cpus; |
kvn@1977 | 405 | } else if (_cpuid_info.std_cpuid1_edx.bits.ht != 0) { |
twisti@1020 | 406 | result = _cpuid_info.std_cpuid1_ebx.bits.threads_per_cpu / |
twisti@1020 | 407 | cores_per_cpu(); |
twisti@1020 | 408 | } |
twisti@1020 | 409 | return result; |
twisti@1020 | 410 | } |
twisti@1020 | 411 | |
twisti@1020 | 412 | static intx L1_data_cache_line_size() { |
twisti@1020 | 413 | intx result = 0; |
twisti@1020 | 414 | if (is_intel()) { |
twisti@1020 | 415 | result = (_cpuid_info.dcp_cpuid4_ebx.bits.L1_line_size + 1); |
twisti@1020 | 416 | } else if (is_amd()) { |
twisti@1020 | 417 | result = _cpuid_info.ext_cpuid5_ecx.bits.L1_line_size; |
twisti@1020 | 418 | } |
twisti@1020 | 419 | if (result < 32) // not defined ? |
twisti@1020 | 420 | result = 32; // 32 bytes by default on x86 and other x64 |
twisti@1020 | 421 | return result; |
twisti@1020 | 422 | } |
twisti@1020 | 423 | |
twisti@1020 | 424 | // |
twisti@1020 | 425 | // Feature identification |
twisti@1020 | 426 | // |
twisti@1020 | 427 | static bool supports_cpuid() { return _cpuFeatures != 0; } |
twisti@1020 | 428 | static bool supports_cmpxchg8() { return (_cpuFeatures & CPU_CX8) != 0; } |
twisti@1020 | 429 | static bool supports_cmov() { return (_cpuFeatures & CPU_CMOV) != 0; } |
twisti@1020 | 430 | static bool supports_fxsr() { return (_cpuFeatures & CPU_FXSR) != 0; } |
twisti@1020 | 431 | static bool supports_ht() { return (_cpuFeatures & CPU_HT) != 0; } |
twisti@1020 | 432 | static bool supports_mmx() { return (_cpuFeatures & CPU_MMX) != 0; } |
twisti@1020 | 433 | static bool supports_sse() { return (_cpuFeatures & CPU_SSE) != 0; } |
twisti@1020 | 434 | static bool supports_sse2() { return (_cpuFeatures & CPU_SSE2) != 0; } |
twisti@1020 | 435 | static bool supports_sse3() { return (_cpuFeatures & CPU_SSE3) != 0; } |
twisti@1020 | 436 | static bool supports_ssse3() { return (_cpuFeatures & CPU_SSSE3)!= 0; } |
twisti@1020 | 437 | static bool supports_sse4_1() { return (_cpuFeatures & CPU_SSE4_1) != 0; } |
twisti@1020 | 438 | static bool supports_sse4_2() { return (_cpuFeatures & CPU_SSE4_2) != 0; } |
twisti@1078 | 439 | static bool supports_popcnt() { return (_cpuFeatures & CPU_POPCNT) != 0; } |
twisti@1020 | 440 | // |
twisti@1020 | 441 | // AMD features |
twisti@1020 | 442 | // |
twisti@1020 | 443 | static bool supports_3dnow() { return (_cpuFeatures & CPU_3DNOW) != 0; } |
twisti@1020 | 444 | static bool supports_mmx_ext() { return is_amd() && _cpuid_info.ext_cpuid1_edx.bits.mmx_amd != 0; } |
twisti@1020 | 445 | static bool supports_3dnow2() { return is_amd() && _cpuid_info.ext_cpuid1_edx.bits.tdnow2 != 0; } |
twisti@1210 | 446 | static bool supports_lzcnt() { return (_cpuFeatures & CPU_LZCNT) != 0; } |
twisti@1020 | 447 | static bool supports_sse4a() { return (_cpuFeatures & CPU_SSE4A) != 0; } |
twisti@1020 | 448 | |
kvn@2269 | 449 | // Intel Core and newer cpus have fast IDIV instruction (excluding Atom). |
kvn@2269 | 450 | static bool has_fast_idiv() { return is_intel() && cpu_family() == 6 && |
kvn@2269 | 451 | supports_sse3() && _model != 0x1C; } |
kvn@2269 | 452 | |
twisti@1020 | 453 | static bool supports_compare_and_exchange() { return true; } |
twisti@1020 | 454 | |
twisti@1020 | 455 | static const char* cpu_features() { return _features_str; } |
twisti@1020 | 456 | |
twisti@1020 | 457 | static intx allocate_prefetch_distance() { |
twisti@1020 | 458 | // This method should be called before allocate_prefetch_style(). |
twisti@1020 | 459 | // |
twisti@1020 | 460 | // Hardware prefetching (distance/size in bytes): |
twisti@1020 | 461 | // Pentium 3 - 64 / 32 |
twisti@1020 | 462 | // Pentium 4 - 256 / 128 |
twisti@1020 | 463 | // Athlon - 64 / 32 ???? |
twisti@1020 | 464 | // Opteron - 128 / 64 only when 2 sequential cache lines accessed |
twisti@1020 | 465 | // Core - 128 / 64 |
twisti@1020 | 466 | // |
twisti@1020 | 467 | // Software prefetching (distance in bytes / instruction with best score): |
twisti@1020 | 468 | // Pentium 3 - 128 / prefetchnta |
twisti@1020 | 469 | // Pentium 4 - 512 / prefetchnta |
twisti@1020 | 470 | // Athlon - 128 / prefetchnta |
twisti@1020 | 471 | // Opteron - 256 / prefetchnta |
twisti@1020 | 472 | // Core - 256 / prefetchnta |
twisti@1020 | 473 | // It will be used only when AllocatePrefetchStyle > 0 |
twisti@1020 | 474 | |
twisti@1020 | 475 | intx count = AllocatePrefetchDistance; |
twisti@1020 | 476 | if (count < 0) { // default ? |
twisti@1020 | 477 | if (is_amd()) { // AMD |
twisti@1020 | 478 | if (supports_sse2()) |
twisti@1020 | 479 | count = 256; // Opteron |
twisti@1020 | 480 | else |
twisti@1020 | 481 | count = 128; // Athlon |
twisti@1020 | 482 | } else { // Intel |
twisti@1020 | 483 | if (supports_sse2()) |
twisti@1020 | 484 | if (cpu_family() == 6) { |
twisti@1020 | 485 | count = 256; // Pentium M, Core, Core2 |
twisti@1020 | 486 | } else { |
twisti@1020 | 487 | count = 512; // Pentium 4 |
twisti@1020 | 488 | } |
twisti@1020 | 489 | else |
twisti@1020 | 490 | count = 128; // Pentium 3 (and all other old CPUs) |
twisti@1020 | 491 | } |
twisti@1020 | 492 | } |
twisti@1020 | 493 | return count; |
twisti@1020 | 494 | } |
twisti@1020 | 495 | static intx allocate_prefetch_style() { |
twisti@1020 | 496 | assert(AllocatePrefetchStyle >= 0, "AllocatePrefetchStyle should be positive"); |
twisti@1020 | 497 | // Return 0 if AllocatePrefetchDistance was not defined. |
twisti@1020 | 498 | return AllocatePrefetchDistance > 0 ? AllocatePrefetchStyle : 0; |
twisti@1020 | 499 | } |
twisti@1020 | 500 | |
twisti@1020 | 501 | // Prefetch interval for gc copy/scan == 9 dcache lines. Derived from |
twisti@1020 | 502 | // 50-warehouse specjbb runs on a 2-way 1.8ghz opteron using a 4gb heap. |
twisti@1020 | 503 | // Tested intervals from 128 to 2048 in increments of 64 == one cache line. |
twisti@1020 | 504 | // 256 bytes (4 dcache lines) was the nearest runner-up to 576. |
twisti@1020 | 505 | |
twisti@1020 | 506 | // gc copy/scan is disabled if prefetchw isn't supported, because |
twisti@1020 | 507 | // Prefetch::write emits an inlined prefetchw on Linux. |
twisti@1020 | 508 | // Do not use the 3dnow prefetchw instruction. It isn't supported on em64t. |
twisti@1020 | 509 | // The used prefetcht0 instruction works for both amd64 and em64t. |
twisti@1020 | 510 | static intx prefetch_copy_interval_in_bytes() { |
twisti@1020 | 511 | intx interval = PrefetchCopyIntervalInBytes; |
twisti@1020 | 512 | return interval >= 0 ? interval : 576; |
twisti@1020 | 513 | } |
twisti@1020 | 514 | static intx prefetch_scan_interval_in_bytes() { |
twisti@1020 | 515 | intx interval = PrefetchScanIntervalInBytes; |
twisti@1020 | 516 | return interval >= 0 ? interval : 576; |
twisti@1020 | 517 | } |
twisti@1020 | 518 | static intx prefetch_fields_ahead() { |
twisti@1020 | 519 | intx count = PrefetchFieldsAhead; |
twisti@1020 | 520 | return count >= 0 ? count : 1; |
twisti@1020 | 521 | } |
twisti@1020 | 522 | }; |