Skip to content

Commit 0b85b8c

Browse files
authored
Merge pull request #9247 from awlauria/osc_pt2pt_changes_v4.1.x
v4.1.x: osc/pt2pt: Some fixes
2 parents: f0b8145 + 334049d; commit: 0b85b8c

File tree

4 files changed

34 additions and 2 deletions (+34 / -2 lines changed)

4 files changed

34 additions and 2 deletions (+34 / -2 lines changed)

ompi/mca/osc/pt2pt/osc_pt2pt.h

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -918,11 +918,14 @@ static inline ompi_osc_pt2pt_sync_t *ompi_osc_pt2pt_module_sync_lookup (ompi_osc
918918

919919
return &module->all_sync;
920920
case OMPI_OSC_PT2PT_SYNC_TYPE_PSCW:
921+
OPAL_THREAD_LOCK(&module->all_sync.lock);
921922
if (ompi_osc_pt2pt_sync_pscw_peer (module, target, peer)) {
923+
OPAL_THREAD_UNLOCK(&module->all_sync.lock);
922924
OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
923925
"osc/pt2pt: found PSCW access epoch target for %d", target));
924926
return &module->all_sync;
925927
}
928+
OPAL_THREAD_UNLOCK(&module->all_sync.lock);
926929
}
927930

928931
return NULL;

ompi/mca/osc/pt2pt/osc_pt2pt_active_target.c

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -213,11 +213,13 @@ int ompi_osc_pt2pt_start (ompi_group_t *group, int assert, ompi_win_t *win)
213213
ompi_osc_pt2pt_module_t *module = GET_MODULE(win);
214214
ompi_osc_pt2pt_sync_t *sync = &module->all_sync;
215215

216+
OPAL_THREAD_LOCK(&module->lock);
216217
OPAL_THREAD_LOCK(&sync->lock);
217218

218219
/* check if we are already in an access epoch */
219220
if (ompi_osc_pt2pt_access_epoch_active (module)) {
220221
OPAL_THREAD_UNLOCK(&sync->lock);
222+
OPAL_THREAD_UNLOCK(&module->lock);
221223
return OMPI_ERR_RMA_SYNC;
222224
}
223225

@@ -251,6 +253,7 @@ int ompi_osc_pt2pt_start (ompi_group_t *group, int assert, ompi_win_t *win)
251253
/* nothing more to do. this is an empty start epoch */
252254
sync->eager_send_active = true;
253255
OPAL_THREAD_UNLOCK(&sync->lock);
256+
OPAL_THREAD_UNLOCK(&module->lock);
254257
return OMPI_SUCCESS;
255258
}
256259

@@ -260,6 +263,7 @@ int ompi_osc_pt2pt_start (ompi_group_t *group, int assert, ompi_win_t *win)
260263
sync->peer_list.peers = ompi_osc_pt2pt_get_peers (module, group);
261264
if (NULL == sync->peer_list.peers) {
262265
OPAL_THREAD_UNLOCK(&sync->lock);
266+
OPAL_THREAD_UNLOCK(&module->lock);
263267
return OMPI_ERR_OUT_OF_RESOURCE;
264268
}
265269

@@ -295,6 +299,7 @@ int ompi_osc_pt2pt_start (ompi_group_t *group, int assert, ompi_win_t *win)
295299
sync->eager_send_active));
296300

297301
OPAL_THREAD_UNLOCK(&sync->lock);
302+
OPAL_THREAD_UNLOCK(&module->lock);
298303
return OMPI_SUCCESS;
299304
}
300305

ompi/mca/osc/pt2pt/osc_pt2pt_data_move.c

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -753,7 +753,9 @@ static int ompi_osc_pt2pt_acc_op_queue (ompi_osc_pt2pt_module_t *module, ompi_os
753753
}
754754

755755
/* add to the pending acc queue */
756-
OPAL_THREAD_SCOPED_LOCK(&module->pending_acc_lock, opal_list_append (&module->pending_acc, &pending_acc->super));
756+
ompi_osc_pt2pt_accumulate_lock(module);
757+
opal_list_append (&module->pending_acc, &pending_acc->super);
758+
ompi_osc_pt2pt_accumulate_unlock(module);
757759

758760
return OMPI_SUCCESS;
759761
}

ompi/mca/osc/pt2pt/osc_pt2pt_passive_target.c

Lines changed: 23 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -272,6 +272,9 @@ static int ompi_osc_pt2pt_lock_internal (int lock_type, int target, int assert,
272272
ompi_osc_pt2pt_module_t *module = GET_MODULE(win);
273273
ompi_osc_pt2pt_sync_t *lock;
274274
int ret = OMPI_SUCCESS;
275+
ompi_osc_pt2pt_sync_t *otherlock = NULL;
276+
int target_key;
277+
void *iter_hash_node = NULL;
275278

276279
/* Check if no_locks is set. TODO: we also need to track whether we are in an
277280
* active target epoch. Fence can make this tricky to track. */
@@ -345,6 +348,25 @@ static int ompi_osc_pt2pt_lock_internal (int lock_type, int target, int assert,
345348
return OMPI_ERR_RMA_CONFLICT;
346349
}
347350

351+
/* All previously requested locks must be complete before we can start a new
352+
* lock, otherwise we deadlock from mis-ordering of locks.
353+
*/
354+
ret = opal_hash_table_get_first_key_uint32(&module->outstanding_locks,
355+
(uint32_t *) &target_key,
356+
(void **) &otherlock,
357+
&iter_hash_node);
358+
while( OPAL_SUCCESS == ret ) {
359+
if( NULL != otherlock ) {
360+
ompi_osc_pt2pt_sync_wait_expected (otherlock);
361+
}
362+
363+
ret = opal_hash_table_get_next_key_uint32(&module->outstanding_locks,
364+
(uint32_t *) &target_key,
365+
(void **) &otherlock,
366+
iter_hash_node, &iter_hash_node);
367+
}
368+
ret = OPAL_SUCCESS;
369+
348370
++module->passive_target_access_epoch;
349371

350372
ompi_osc_pt2pt_module_lock_insert (module, lock);
@@ -596,7 +618,7 @@ int ompi_osc_pt2pt_flush_all (struct ompi_win_t *win)
596618
}
597619

598620
ret = opal_hash_table_get_next_key_uint32 (&module->outstanding_locks, (uint32_t *) &target,
599-
(void **) lock, node, &node);
621+
(void **) &lock, node, &node);
600622
if (OPAL_SUCCESS != ret) {
601623
ret = OPAL_SUCCESS;
602624
break;

0 commit comments

Comments
 (0)