Skip to content

Instantly share code, notes, and snippets.

@crmne
Last active July 14, 2018 04:21
Show Gist options
  • Save crmne/474b44eca214e1c8238f52b21d209dcb to your computer and use it in GitHub Desktop.
Save crmne/474b44eca214e1c8238f52b21d209dcb to your computer and use it in GitHub Desktop.
diff --git a/tensorflow/core/kernels/concat_lib_gpu_impl.cu.cc b/tensorflow/core/kernels/concat_lib_gpu_impl.cu.cc
index 0f7adaf24a..355584456b 100644
--- a/tensorflow/core/kernels/concat_lib_gpu_impl.cu.cc
+++ b/tensorflow/core/kernels/concat_lib_gpu_impl.cu.cc
@@ -69,7 +69,7 @@ __global__ void concat_variable_kernel(
IntType num_inputs = input_ptr_data.size;
// verbose declaration needed due to template
- extern __shared__ __align__(sizeof(T)) unsigned char smem[];
+ extern __shared__ unsigned char smem[];
IntType* smem_col_scan = reinterpret_cast<IntType*>(smem);
if (useSmem) {
diff --git a/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc b/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc
index ecfe51d599..3c2efb8a73 100644
--- a/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc
@@ -163,7 +163,7 @@ __global__ __launch_bounds__(1024, 2) void DepthwiseConv2dGPUKernelNHWCSmall(
const DepthwiseArgs args, const T* input, const T* filter, T* output) {
assert(CanLaunchDepthwiseConv2dGPUSmall(args));
// Holds block plus halo and filter data for blockDim.x depths.
- extern __shared__ __align__(sizeof(T)) unsigned char shared_memory[];
+ extern __shared__ unsigned char shared_memory[];
T* const shared_data = reinterpret_cast<T*>(shared_memory);
const int batches = args.batch;
@@ -433,7 +433,7 @@ __global__ __launch_bounds__(1024, 2) void DepthwiseConv2dGPUKernelNCHWSmall(
const DepthwiseArgs args, const T* input, const T* filter, T* output) {
assert(CanLaunchDepthwiseConv2dGPUSmall(args));
// Holds block plus halo and filter data for blockDim.z depths.
- extern __shared__ __align__(sizeof(T)) unsigned char shared_memory[];
+ extern __shared__ unsigned char shared_memory[];
T* const shared_data = reinterpret_cast<T*>(shared_memory);
const int batches = args.batch;
@@ -1051,7 +1051,7 @@ __launch_bounds__(1024, 2) void DepthwiseConv2dBackpropFilterGPUKernelNHWCSmall(
const DepthwiseArgs args, const T* output, const T* input, T* filter) {
assert(CanLaunchDepthwiseConv2dBackpropFilterGPUSmall(args, blockDim.z));
// Holds block plus halo and filter data for blockDim.x depths.
- extern __shared__ __align__(sizeof(T)) unsigned char shared_memory[];
+ extern __shared__ unsigned char shared_memory[];
T* const shared_data = reinterpret_cast<T*>(shared_memory);
const int batches = args.batch;
@@ -1310,7 +1310,7 @@ __launch_bounds__(1024, 2) void DepthwiseConv2dBackpropFilterGPUKernelNCHWSmall(
const DepthwiseArgs args, const T* output, const T* input, T* filter) {
assert(CanLaunchDepthwiseConv2dBackpropFilterGPUSmall(args, blockDim.x));
// Holds block plus halo and filter data for blockDim.z depths.
- extern __shared__ __align__(sizeof(T)) unsigned char shared_memory[];
+ extern __shared__ unsigned char shared_memory[];
T* const shared_data = reinterpret_cast<T*>(shared_memory);
const int batches = args.batch;
diff --git a/tensorflow/core/kernels/split_lib_gpu.cu.cc b/tensorflow/core/kernels/split_lib_gpu.cu.cc
index dd6fc6115f..6a72c9a2fc 100644
--- a/tensorflow/core/kernels/split_lib_gpu.cu.cc
+++ b/tensorflow/core/kernels/split_lib_gpu.cu.cc
@@ -119,7 +119,7 @@ __global__ void split_v_kernel(const T* input_ptr,
int num_outputs = output_ptr_data.size;
// verbose declaration needed due to template
- extern __shared__ __align__(sizeof(T)) unsigned char smem[];
+ extern __shared__ unsigned char smem[];
IntType* smem_col_scan = reinterpret_cast<IntType*>(smem);
if (useSmem) {
diff --git a/third_party/gpus/cuda/BUILD.tpl b/third_party/gpus/cuda/BUILD.tpl
index b752734a08..3f33af2684 100644
--- a/third_party/gpus/cuda/BUILD.tpl
+++ b/third_party/gpus/cuda/BUILD.tpl
@@ -109,7 +109,7 @@ cc_library(
".",
"cuda/include",
],
- linkopts = ["-lgomp"],
+ linkopts = ["-L/usr/local/lib/gcc/7"],
linkstatic = 1,
visibility = ["//visibility:public"],
)
@iRonJ
Copy link

iRonJ commented Jul 14, 2018

I've had better luck adding "/usr/local/lib/gcc/4.9" to the LD_LIBRARY_PATH then changing line 9 in tensorflow140_mac1013_cuda_libgompgcc7.patch to:
' linkopts = ["-llibgomp"],'

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment