Last active
July 14, 2018 04:21
-
-
Save crmne/474b44eca214e1c8238f52b21d209dcb to your computer and use it in GitHub Desktop.
Patches that add macOS CUDA support to TensorFlow 1.4.0. They accompany the post http://paolino.me/tutorial/tensorflow/machine-learning/deep-learning/gpu/2017/11/18/installing-tensorflow-1.4.0-macos-cuda/
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
diff --git a/tensorflow/core/kernels/concat_lib_gpu_impl.cu.cc b/tensorflow/core/kernels/concat_lib_gpu_impl.cu.cc | |
index 0f7adaf24a..355584456b 100644 | |
--- a/tensorflow/core/kernels/concat_lib_gpu_impl.cu.cc | |
+++ b/tensorflow/core/kernels/concat_lib_gpu_impl.cu.cc | |
@@ -69,7 +69,7 @@ __global__ void concat_variable_kernel( | |
IntType num_inputs = input_ptr_data.size; | |
// verbose declaration needed due to template | |
- extern __shared__ __align__(sizeof(T)) unsigned char smem[]; | |
+ extern __shared__ unsigned char smem[]; | |
IntType* smem_col_scan = reinterpret_cast<IntType*>(smem); | |
if (useSmem) { | |
diff --git a/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc b/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc | |
index ecfe51d599..3c2efb8a73 100644 | |
--- a/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc | |
+++ b/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc | |
@@ -163,7 +163,7 @@ __global__ __launch_bounds__(1024, 2) void DepthwiseConv2dGPUKernelNHWCSmall( | |
const DepthwiseArgs args, const T* input, const T* filter, T* output) { | |
assert(CanLaunchDepthwiseConv2dGPUSmall(args)); | |
// Holds block plus halo and filter data for blockDim.x depths. | |
- extern __shared__ __align__(sizeof(T)) unsigned char shared_memory[]; | |
+ extern __shared__ unsigned char shared_memory[]; | |
T* const shared_data = reinterpret_cast<T*>(shared_memory); | |
const int batches = args.batch; | |
@@ -433,7 +433,7 @@ __global__ __launch_bounds__(1024, 2) void DepthwiseConv2dGPUKernelNCHWSmall( | |
const DepthwiseArgs args, const T* input, const T* filter, T* output) { | |
assert(CanLaunchDepthwiseConv2dGPUSmall(args)); | |
// Holds block plus halo and filter data for blockDim.z depths. | |
- extern __shared__ __align__(sizeof(T)) unsigned char shared_memory[]; | |
+ extern __shared__ unsigned char shared_memory[]; | |
T* const shared_data = reinterpret_cast<T*>(shared_memory); | |
const int batches = args.batch; | |
@@ -1051,7 +1051,7 @@ __launch_bounds__(1024, 2) void DepthwiseConv2dBackpropFilterGPUKernelNHWCSmall( | |
const DepthwiseArgs args, const T* output, const T* input, T* filter) { | |
assert(CanLaunchDepthwiseConv2dBackpropFilterGPUSmall(args, blockDim.z)); | |
// Holds block plus halo and filter data for blockDim.x depths. | |
- extern __shared__ __align__(sizeof(T)) unsigned char shared_memory[]; | |
+ extern __shared__ unsigned char shared_memory[]; | |
T* const shared_data = reinterpret_cast<T*>(shared_memory); | |
const int batches = args.batch; | |
@@ -1310,7 +1310,7 @@ __launch_bounds__(1024, 2) void DepthwiseConv2dBackpropFilterGPUKernelNCHWSmall( | |
const DepthwiseArgs args, const T* output, const T* input, T* filter) { | |
assert(CanLaunchDepthwiseConv2dBackpropFilterGPUSmall(args, blockDim.x)); | |
// Holds block plus halo and filter data for blockDim.z depths. | |
- extern __shared__ __align__(sizeof(T)) unsigned char shared_memory[]; | |
+ extern __shared__ unsigned char shared_memory[]; | |
T* const shared_data = reinterpret_cast<T*>(shared_memory); | |
const int batches = args.batch; | |
diff --git a/tensorflow/core/kernels/split_lib_gpu.cu.cc b/tensorflow/core/kernels/split_lib_gpu.cu.cc | |
index dd6fc6115f..6a72c9a2fc 100644 | |
--- a/tensorflow/core/kernels/split_lib_gpu.cu.cc | |
+++ b/tensorflow/core/kernels/split_lib_gpu.cu.cc | |
@@ -119,7 +119,7 @@ __global__ void split_v_kernel(const T* input_ptr, | |
int num_outputs = output_ptr_data.size; | |
// verbose declaration needed due to template | |
- extern __shared__ __align__(sizeof(T)) unsigned char smem[]; | |
+ extern __shared__ unsigned char smem[]; | |
IntType* smem_col_scan = reinterpret_cast<IntType*>(smem); | |
if (useSmem) { |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
diff --git a/third_party/gpus/cuda/BUILD.tpl b/third_party/gpus/cuda/BUILD.tpl | |
index b752734a08..3f33af2684 100644 | |
--- a/third_party/gpus/cuda/BUILD.tpl | |
+++ b/third_party/gpus/cuda/BUILD.tpl | |
@@ -109,7 +109,7 @@ cc_library( | |
".", | |
"cuda/include", | |
], | |
- linkopts = ["-lgomp"], | |
+ linkopts = ["-L/usr/local/lib/gcc/7"], | |
linkstatic = 1, | |
visibility = ["//visibility:public"], | |
) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
I've had better luck adding "/usr/local/lib/gcc/4.9" to LD_LIBRARY_PATH and then changing line 9 in tensorflow140_mac1013_cuda_libgompgcc7.patch to:
' linkopts = ["-llibgomp"],'