// Copyright 2013 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

// This file is an internal atomic implementation, use butil/atomicops.h instead.
//
// LinuxKernelCmpxchg and Barrier_AtomicIncrement are from Google Gears.

#ifndef BUTIL_ATOMICOPS_INTERNALS_ARM_GCC_H_
#define BUTIL_ATOMICOPS_INTERNALS_ARM_GCC_H_

#if defined(OS_QNX)
#include <sys/cpuinline.h>
#endif

namespace butil {
namespace subtle {
// Memory barriers on ARM are funky, but the kernel is here to help:
//
// * ARMv5 didn't support SMP, there is no memory barrier instruction at
//   all on this architecture, or when targeting its machine code.
//
// * Some ARMv6 CPUs support SMP. A full memory barrier can be produced by
//   writing a random value to a very specific coprocessor register.
//
// * On ARMv7, the "dmb" instruction is used to perform a full memory
//   barrier (though writing to the co-processor will still work).
//   However, on single core devices (e.g. Nexus One, or Nexus S),
//   this instruction will take up to 200 ns, which is huge, even though
//   it's completely un-needed on these devices.
//
// * There is no easy way to determine at runtime if the device is
//   single or multi-core. However, the kernel provides a useful helper
//   function at a fixed memory address (0xffff0fa0), which will always
//   perform a memory barrier in the most efficient way. I.e. on single
//   core devices, this is an empty function that exits immediately.
//   On multi-core devices, it implements a full memory barrier.
//
// * This source could be compiled to ARMv5 machine code that runs on a
//   multi-core ARMv6 or ARMv7 device. In this case, memory barriers
//   are needed for correct execution. Always call the kernel helper, even
//   when targeting ARMv5TE.
//
inline void MemoryBarrier() {
#if defined(OS_LINUX) || defined(OS_ANDROID)
  // Note: This is a function call, which is also an implicit compiler barrier.
  typedef void (*KernelMemoryBarrierFunc)();
  ((KernelMemoryBarrierFunc)0xffff0fa0)();
#elif defined(OS_QNX)
  __cpu_membarrier();
#else
#error MemoryBarrier() is not implemented on this platform.
#endif
}
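
// For reference only: on toolchains where GCC's legacy __sync built-ins are
// usable for the targeted architecture, an equivalent full barrier could be
// sketched as
//
//   inline void MemoryBarrier() { __sync_synchronize(); }
//
// It is not used here because this file must also work when compiling ARMv5
// machine code that may run on multi-core ARMv6/ARMv7 devices, where the
// kernel helper is the reliable choice.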

// An ARM toolchain would only define one of these depending on which
// variant of the target architecture is being used. This tests against
// any known ARMv6 or ARMv7 variant, where it is possible to directly
// use ldrex/strex instructions to implement fast atomic operations.
#if defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) || \
    defined(__ARM_ARCH_7R__) || defined(__ARM_ARCH_7M__) || \
    defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || \
    defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6Z__) || \
    defined(__ARM_ARCH_6ZK__) || defined(__ARM_ARCH_6T2__)

inline Atomic32 NoBarrier_CompareAndSwap(volatile Atomic32* ptr,
                                         Atomic32 old_value,
                                         Atomic32 new_value) {
  Atomic32 prev_value;
  int reloop;
  do {
    // The following is equivalent to:
    //
    //   prev_value = LDREX(ptr)
    //   reloop = 0
    //   if (prev_value == old_value)
    //     reloop = STREX(ptr, new_value)
    __asm__ __volatile__(" ldrex %0, [%3]\n"
                         " mov %1, #0\n"
                         " cmp %0, %4\n"
#ifdef __thumb2__
                         " it eq\n"
#endif
                         " strexeq %1, %5, [%3]\n"
                         : "=&r"(prev_value), "=&r"(reloop), "+m"(*ptr)
                         : "r"(ptr), "r"(old_value), "r"(new_value)
                         : "cc", "memory");
  } while (reloop != 0);
  return prev_value;
}
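
// For comparison only: on compilers that provide the legacy __sync built-ins
// for this target, a close analogue of the loop above could be written as
//
//   // return __sync_val_compare_and_swap(ptr, old_value, new_value);
//
// Note that the built-in implies a full memory barrier, unlike the NoBarrier_
// variant here; the explicit LDREX/STREX loop is kept so the generated code
// does not depend on the toolchain's built-in support.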

inline Atomic32 Acquire_CompareAndSwap(volatile Atomic32* ptr,
                                       Atomic32 old_value,
                                       Atomic32 new_value) {
  Atomic32 result = NoBarrier_CompareAndSwap(ptr, old_value, new_value);
  MemoryBarrier();
  return result;
}

inline Atomic32 Release_CompareAndSwap(volatile Atomic32* ptr,
                                       Atomic32 old_value,
                                       Atomic32 new_value) {
  MemoryBarrier();
  return NoBarrier_CompareAndSwap(ptr, old_value, new_value);
}

inline Atomic32 NoBarrier_AtomicIncrement(volatile Atomic32* ptr,
                                          Atomic32 increment) {
  Atomic32 value;
  int reloop;
  do {
    // Equivalent to:
    //
    //   value = LDREX(ptr)
    //   value += increment
    //   reloop = STREX(ptr, value)
    //
    __asm__ __volatile__(" ldrex %0, [%3]\n"
                         " add %0, %0, %4\n"
                         " strex %1, %0, [%3]\n"
                         : "=&r"(value), "=&r"(reloop), "+m"(*ptr)
                         : "r"(ptr), "r"(increment)
                         : "cc", "memory");
  } while (reloop);
  return value;
}
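
// For comparison only: this returns the incremented (new) value, which on
// compilers with the legacy __sync built-ins roughly corresponds to
//
//   // return __sync_add_and_fetch(ptr, increment);
//
// except that the built-in also implies a full memory barrier, whereas the
// NoBarrier_ variant above deliberately does not.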

inline Atomic32 Barrier_AtomicIncrement(volatile Atomic32* ptr,
                                        Atomic32 increment) {
  // TODO(digit): Investigate if it's possible to implement this with
  // a single MemoryBarrier() operation between the LDREX and STREX.
  // See http://crbug.com/246514
  MemoryBarrier();
  Atomic32 result = NoBarrier_AtomicIncrement(ptr, increment);
  MemoryBarrier();
  return result;
}

inline Atomic32 NoBarrier_AtomicExchange(volatile Atomic32* ptr,
                                         Atomic32 new_value) {
  Atomic32 old_value;
  int reloop;
  do {
    // Equivalent to:
    //
    //   old_value = LDREX(ptr)
    //   reloop = STREX(ptr, new_value)
    __asm__ __volatile__(" ldrex %0, [%3]\n"
                         " strex %1, %4, [%3]\n"
                         : "=&r"(old_value), "=&r"(reloop), "+m"(*ptr)
                         : "r"(ptr), "r"(new_value)
                         : "cc", "memory");
  } while (reloop != 0);
  return old_value;
}

// This tests against any known ARMv5 variant.
#elif defined(__ARM_ARCH_5__) || defined(__ARM_ARCH_5T__) || \
      defined(__ARM_ARCH_5TE__) || defined(__ARM_ARCH_5TEJ__)

// The kernel also provides a helper function to perform an atomic
// compare-and-swap operation at the hard-wired address 0xffff0fc0.
// On ARMv5, this is implemented by a special code path that the kernel
// detects and treats specially when thread preemption happens.
// On ARMv6 and higher, it uses LDREX/STREX instructions instead.
//
// Note that this always performs a full memory barrier, so there is no
// need to add calls to MemoryBarrier() before or after it. It also
// returns 0 on success, and 1 on failure.
//
// Available and reliable since Linux 2.6.24. Both Android and ChromeOS
// use newer kernel revisions, so this should not be a concern.
namespace {

inline int LinuxKernelCmpxchg(Atomic32 old_value,
                              Atomic32 new_value,
                              volatile Atomic32* ptr) {
  typedef int (*KernelCmpxchgFunc)(Atomic32, Atomic32, volatile Atomic32*);
  return ((KernelCmpxchgFunc)0xffff0fc0)(old_value, new_value, ptr);
}

}  // namespace

inline Atomic32 NoBarrier_CompareAndSwap(volatile Atomic32* ptr,
                                         Atomic32 old_value,
                                         Atomic32 new_value) {
  Atomic32 prev_value;
  for (;;) {
    prev_value = *ptr;
    if (prev_value != old_value)
      return prev_value;
    if (!LinuxKernelCmpxchg(old_value, new_value, ptr))
      return old_value;
  }
}

inline Atomic32 NoBarrier_AtomicExchange(volatile Atomic32* ptr,
                                         Atomic32 new_value) {
  Atomic32 old_value;
  do {
    old_value = *ptr;
  } while (LinuxKernelCmpxchg(old_value, new_value, ptr));
  return old_value;
}

inline Atomic32 NoBarrier_AtomicIncrement(volatile Atomic32* ptr,
                                          Atomic32 increment) {
  return Barrier_AtomicIncrement(ptr, increment);
}

inline Atomic32 Barrier_AtomicIncrement(volatile Atomic32* ptr,
                                        Atomic32 increment) {
  for (;;) {
    // Atomic exchange the old value with an incremented one.
    Atomic32 old_value = *ptr;
    Atomic32 new_value = old_value + increment;
    if (!LinuxKernelCmpxchg(old_value, new_value, ptr)) {
      // The exchange took place as expected.
      return new_value;
    }
    // Otherwise, *ptr changed mid-loop and we need to retry.
  }
}

inline Atomic32 Acquire_CompareAndSwap(volatile Atomic32* ptr,
                                       Atomic32 old_value,
                                       Atomic32 new_value) {
  Atomic32 prev_value;
  for (;;) {
    prev_value = *ptr;
    if (prev_value != old_value) {
      // Always ensure acquire semantics.
      MemoryBarrier();
      return prev_value;
    }
    if (!LinuxKernelCmpxchg(old_value, new_value, ptr))
      return old_value;
  }
}

inline Atomic32 Release_CompareAndSwap(volatile Atomic32* ptr,
                                       Atomic32 old_value,
                                       Atomic32 new_value) {
  // This could be implemented as:
  //
  //   MemoryBarrier();
  //   return NoBarrier_CompareAndSwap(ptr, old_value, new_value);
  //
  // But that would use 3 barriers per successful CAS. To save performance,
  // use Acquire_CompareAndSwap() instead. Its implementation guarantees that:
  // - A successful swap uses only 2 barriers (in the kernel helper).
  // - An early return due to (prev_value != old_value) performs
  //   a memory barrier with no store, which is equivalent to the
  //   generic implementation above.
  return Acquire_CompareAndSwap(ptr, old_value, new_value);
}

#else
#  error "Your CPU's ARM architecture is not supported yet"
#endif

// NOTE: Atomicity of the following load and store operations is only
// guaranteed in case of 32-bit alignment of |ptr| values.

inline void NoBarrier_Store(volatile Atomic32* ptr, Atomic32 value) {
  *ptr = value;
}

inline void Acquire_Store(volatile Atomic32* ptr, Atomic32 value) {
  *ptr = value;
  MemoryBarrier();
}

inline void Release_Store(volatile Atomic32* ptr, Atomic32 value) {
  MemoryBarrier();
  *ptr = value;
}

inline Atomic32 NoBarrier_Load(volatile const Atomic32* ptr) { return *ptr; }

inline Atomic32 Acquire_Load(volatile const Atomic32* ptr) {
  Atomic32 value = *ptr;
  MemoryBarrier();
  return value;
}

inline Atomic32 Release_Load(volatile const Atomic32* ptr) {
  MemoryBarrier();
  return *ptr;
}
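
// Usage sketch (illustrative only, not part of this header): a single-word
// flag handed off from a producer thread to a consumer. Release_Store makes
// writes issued before it visible once the flag is observed via Acquire_Load.
// The variable names below are hypothetical.
//
//   // producer:
//   //   payload = 42;                   // plain write
//   //   Release_Store(&ready_flag, 1);  // publish the payload
//   //
//   // consumer:
//   //   if (Acquire_Load(&ready_flag))  // observe the flag
//   //     use(payload);                 // guaranteed to see 42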

}  // namespace butil::subtle
}  // namespace butil

#endif  // BUTIL_ATOMICOPS_INTERNALS_ARM_GCC_H_