
Commit a17d96d

VirrageS authored and apaszke committed
Add multiple thread support for DataChannels
Previously, when using the same data channel from multiple threads, there was no guarantee against deadlocks or outright errors.
1 parent b7dcc29 commit a17d96d

File tree

5 files changed: +56 -10 lines

  torch/lib/THD/base/data_channels/DataChannelGloo.cpp
  torch/lib/THD/base/data_channels/DataChannelMPI.cpp
  torch/lib/THD/base/data_channels/DataChannelTCP.cpp
  torch/lib/THD/base/data_channels/DataChannelTCP.hpp
  torch/lib/THD/base/data_channels/GlooCache.hpp


torch/lib/THD/base/data_channels/DataChannelGloo.cpp

+17-4
@@ -138,7 +138,11 @@ void DataChannelGloo::allGatherT(std::vector<thpp::Tensor*>& output,
       group_id, _groups.at(group_id), tensor_bytes, all_tensor_bytes, input.numel());
 
   std::memcpy(std::get<1>(ret).get(), input.data(), tensor_bytes);
-  std::get<0>(ret)->run();
+
+  {
+    std::lock_guard<std::mutex> lock(*std::get<3>(ret));
+    std::get<0>(ret)->run();
+  }
 
   for (std::size_t i = 0; i < output.size(); i++) {
     std::memcpy(output.at(i)->data(),
@@ -185,7 +189,10 @@ void DataChannelGloo::allReduceT(thpp::Tensor& t, THDReduceOp operation,
       group_id, _groups.at(group_id), tensor_bytes, t.numel(), operation);
 
   std::memcpy(std::get<1>(ret).get(), t.data(), tensor_bytes);
-  std::get<0>(ret)->run();
+  {
+    std::lock_guard<std::mutex> lock(*std::get<3>(ret));
+    std::get<0>(ret)->run();
+  }
   std::memcpy(t.data(), std::get<2>(ret).get(), tensor_bytes);
 }

@@ -214,7 +221,10 @@ void DataChannelGloo::broadcastT(thpp::Tensor& data, rank_type src_rank,
   if (_rank == src_rank)
     std::memcpy(std::get<1>(ret).get(), data.data(), tensor_bytes);
 
-  std::get<0>(ret)->run();
+  {
+    std::lock_guard<std::mutex> lock(*std::get<3>(ret));
+    std::get<0>(ret)->run();
+  }
 
   if (_rank != src_rank)
     std::memcpy(data.data(), std::get<2>(ret).get(), tensor_bytes);
@@ -267,7 +277,10 @@ void DataChannelGloo::barrier(THDGroup group_id) {
   RETURN_IF_NOT_IN_GROUP
   auto ret = _cache->getAlgorithm<CollectiveType::BARRIER, void>(
       group_id, _groups.at(group_id));
-  std::get<0>(ret)->run();
+  {
+    std::lock_guard<std::mutex> lock(*std::get<3>(ret));
+    std::get<0>(ret)->run();
+  }
 }
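Note: each collective above applies the same guarded-run pattern to the tuple returned by GlooCache (algorithm, input buffer, output buffer, mutex). A minimal sketch of that pattern, assuming that tuple layout; the helper name runGuarded is illustrative and not part of the codebase:

#include <mutex>
#include <tuple>

// Sketch only: Ret is any tuple whose element 0 has run() and whose
// element 3 is a std::shared_ptr<std::mutex>, as in GlooCache::value_type.
template <typename Ret>
void runGuarded(Ret& ret) {
  // Serialize concurrent run() calls on the same cached algorithm.
  std::lock_guard<std::mutex> lock(*std::get<3>(ret));
  std::get<0>(ret)->run();
}

Because the mutex lives in the cache entry, threads running different cached algorithms do not block each other; only runs of the same algorithm are serialized.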

torch/lib/THD/base/data_channels/DataChannelMPI.cpp

+7-1
@@ -92,7 +92,13 @@ DataChannelMPI::~DataChannelMPI() {
 
 
 bool DataChannelMPI::init() {
-  MPI_Init(NULL, NULL);
+  int* provided = NULL;
+  MPI_Init_thread(NULL, NULL, MPI_THREAD_MULTIPLE, provided);
+  if (*provided != MPI_THREAD_MULTIPLE) {
+    std::cerr << "MPI implementation does not support multiple threads."
+              << "Using same data channel in multiple thread can result in"
+              << "wrong results or errors." << std::endl;
+  }
 
   int rank, num_processes;
   MPI_Comm_size(MPI_COMM_WORLD, &num_processes);
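For reference, MPI_Init_thread(argc, argv, required, provided) treats its last parameter as an output: the library writes the threading level it actually grants into the int it points to. A minimal standalone sketch of the conventional call (not code from this diff), passing the address of a local int and checking it afterwards:

#include <mpi.h>
#include <iostream>

int main(int argc, char** argv) {
  int provided = 0;
  // Request full multi-threading; MPI stores the granted level in `provided`.
  MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &provided);
  if (provided != MPI_THREAD_MULTIPLE) {
    std::cerr << "MPI implementation does not support MPI_THREAD_MULTIPLE; "
              << "using the same data channel from several threads may fail."
              << std::endl;
  }
  MPI_Finalize();
  return 0;
}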

torch/lib/THD/base/data_channels/DataChannelTCP.cpp

+14
@@ -293,6 +293,8 @@ void DataChannelTCP::allGather(std::vector<thpp::Tensor*>& output,
   * efficient also for small data (< 512 KB).
   */
 
+  std::lock_guard<std::mutex> lock(_mutex);
+
   const auto& group = _groups.at(group_id);
   rank_type group_rank;
   bool exists;
@@ -325,6 +327,8 @@ void DataChannelTCP::allGather(std::vector<thpp::Tensor*>& output,
 
 void DataChannelTCP::gather(std::vector<thpp::Tensor*>& output,
                             thpp::Tensor& input, rank_type dst_rank, THDGroup group_id) {
+  std::lock_guard<std::mutex> lock(_mutex);
+
   const auto& group = _groups.at(group_id);
   bool exists;

@@ -358,6 +362,8 @@ void DataChannelTCP::gather(std::vector<thpp::Tensor*>& output,
 void DataChannelTCP::scatter(std::vector<thpp::Tensor*>& input,
                              thpp::Tensor& output, rank_type src_rank,
                              THDGroup group_id) {
+  std::lock_guard<std::mutex> lock(_mutex);
+
   const auto& group = _groups.at(group_id);
   bool exists;

@@ -404,6 +410,8 @@ void DataChannelTCP::allReduce(thpp::Tensor& data, THDReduceOp operation,
   * > https://github.com/pmodels/mpich/blob/master/src/mpi/coll/allreduce.c
   */
 
+  std::lock_guard<std::mutex> lock(_mutex);
+
   const auto& group = _groups.at(group_id);
   rank_type group_rank;
   bool exists;
@@ -471,6 +479,8 @@ void DataChannelTCP::reduce(thpp::Tensor& data, THDReduceOp operation,
   * order and direction of communication.
   */
 
+  std::lock_guard<std::mutex> lock(_mutex);
+
   const auto& group = _groups.at(group_id);
   rank_type group_rank;
   bool exists;
@@ -518,6 +528,8 @@ void DataChannelTCP::broadcast(thpp::Tensor& data, rank_type src_rank,
   * virtual ones where `virtual_rank` for `src_rank` is 0.
   */
 
+  std::lock_guard<std::mutex> lock(_mutex);
+
   const auto& group = _groups.at(group_id);
   rank_type group_rank;
   bool exists;
@@ -644,6 +656,8 @@ void DataChannelTCP::barrier(THDGroup group_id) {
   * we do recv asynchronously (thread), send byte and then wait for recv to complete.
   */
 
+  std::lock_guard<std::mutex> lock(_mutex);
+
   const auto& group = _groups.at(group_id);
   rank_type group_rank;
   bool exists;
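Unlike the Gloo channel, DataChannelTCP takes a coarse-grained approach: a single per-channel mutex (declared in the header change below) is locked at the top of every collective, so concurrent calls on one channel object run one at a time. A stripped-down sketch of that shape, with hypothetical method content:

#include <mutex>

// Illustrative shape only; the real methods perform blocking socket I/O.
struct TcpChannelSketch {
  void allReduce(/* tensor, op, group */) {
    // Held for the whole collective: calls on this object are serialized.
    std::lock_guard<std::mutex> lock(_mutex);
    // ... send/recv work happens here while the lock is held ...
  }

 private:
  std::mutex _mutex;  // plays the same role as DataChannelTCP::_mutex
};

The cost is that two threads cannot overlap different collectives on the same TCP channel, which the per-algorithm mutexes on the Gloo side avoid.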

torch/lib/THD/base/data_channels/DataChannelTCP.hpp

+2
@@ -87,6 +87,8 @@ struct DataChannelTCP : DataChannel {
   std::vector<Process> _processes; // Other processes in network
   std::unique_ptr<struct pollfd[]> _poll_events; // Events array for `poll`
 
+  std::mutex _mutex; // General mutex for methods - to make methods run atomically.
+
   // Existing groups of processes and corresponding group ids
   std::unordered_map<THDGroup, DataChannel::Group> _groups;

torch/lib/THD/base/data_channels/GlooCache.hpp

+16-5
@@ -58,7 +58,8 @@ struct GlooCache {
   using value_type = std::tuple<
     std::shared_ptr<algorithm_type>, // algorithm
     std::shared_ptr<buffer_type>, // input buffer (nullptr if not used)
-    std::shared_ptr<buffer_type> // output buffer (nullptr if not used)
+    std::shared_ptr<buffer_type>, // output buffer (nullptr if not used)
+    std::shared_ptr<std::mutex> // mutex to make algorithms run atomically
   >;
 
   GlooCache(rank_type rank, std::shared_ptr<::gloo::transport::Device> device,
@@ -88,6 +89,10 @@ struct GlooCache {
   template<CollectiveType D, typename T, typename... Args>
   value_type getAlgorithm(THDGroup group_id, const DataChannel::Group& group,
                           Args... args) {
+    // We need to protect from race when two (or more) threads are trying to
+    // create same algorithm simultaneously.
+    std::lock_guard<std::mutex> lock(_mutex);
+
     auto key = algorithm_spec<D, T>::key(group_id, args...);
     auto it = _algorithms.find(key);
     if (it == _algorithms.end()) {
@@ -116,6 +121,8 @@ struct GlooCache {
   std::shared_ptr<::gloo::transport::Device> _device;
   std::shared_ptr<store_type> _store;
 
+  std::mutex _mutex;
+
   std::unordered_map<key_type, value_type> _algorithms;
 };

@@ -164,7 +171,8 @@ struct algorithm_spec<CollectiveType::ALL_GATHER, T> {
         reinterpret_cast<T*>(output_buffer.get()),
         count),
       input_buffer,
-      output_buffer
+      output_buffer,
+      std::make_shared<std::mutex>()
     );
   }
 };
@@ -192,7 +200,8 @@ struct algorithm_spec<CollectiveType::ALL_REDUCE, T> {
         count,
         THDToGlooReduceOp<T>(op)),
       input_buffer,
-      input_buffer // we get the result in same buffer
+      input_buffer, // we get the result in same buffer
+      std::make_shared<std::mutex>()
     );
   }
 };
@@ -220,7 +229,8 @@ struct algorithm_spec<CollectiveType::BROADCAST, T> {
         count,
         src_rank),
       input_buffer,
-      input_buffer // we get the result in same buffer
+      input_buffer, // we get the result in same buffer
+      std::make_shared<std::mutex>()
     );
   }
 };
@@ -239,7 +249,8 @@ struct algorithm_spec<CollectiveType::BARRIER, T> {
     return std::make_tuple(
       std::make_shared<::gloo::BarrierAllToAll>(context),
       nullptr,
-      nullptr
+      nullptr,
+      std::make_shared<std::mutex>()
    );
  }
 };
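The cache-level _mutex in getAlgorithm guards a different race than the per-algorithm mutexes above: without it, two threads requesting the same key could both miss the lookup and construct the algorithm twice while mutating _algorithms concurrently. A generic get-or-create-under-lock sketch with hypothetical names (not the GlooCache API):

#include <memory>
#include <mutex>
#include <string>
#include <unordered_map>

struct AlgorithmCacheSketch {
  std::shared_ptr<int> getOrCreate(const std::string& key) {
    // The lock spans both the lookup and the insertion, so exactly one
    // thread constructs the entry for a given key and the map is never
    // mutated concurrently.
    std::lock_guard<std::mutex> lock(_mutex);
    auto it = _entries.find(key);
    if (it == _entries.end())
      it = _entries.emplace(key, std::make_shared<int>(0)).first;
    return it->second;
  }

 private:
  std::mutex _mutex;
  std::unordered_map<std::string, std::shared_ptr<int>> _entries;
};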
