Skip to content

Commit 3f43e2c

Browse files
authored
Merge pull request #4419 from thananon/pr_newob1
pml/ob1: match callback will now queue wrong sequence frag and return.
2 parents c3ac3f7 + 409638b commit 3f43e2c

File tree

1 file changed

+183
-87
lines changed

1 file changed

+183
-87
lines changed

ompi/mca/pml/ob1/pml_ob1_recvfrag.c

Lines changed: 183 additions & 87 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
44
* University Research and Technology
55
* Corporation. All rights reserved.
6-
* Copyright (c) 2004-2016 The University of Tennessee and The University
6+
* Copyright (c) 2004-2017 The University of Tennessee and The University
77
* of Tennessee Research Foundation. All rights
88
* reserved.
99
* Copyright (c) 2004-2007 High Performance Computing Center Stuttgart,
@@ -66,7 +66,7 @@ OBJ_CLASS_INSTANCE( mca_pml_ob1_recv_frag_t,
6666
*/
6767

6868
/**
69-
* Append a unexpected descriptor to a queue. This function will allocate and
69+
* Append an unexpected descriptor to a queue. This function will allocate and
7070
* initialize the fragment (if necessary) and then will add it to the specified
7171
* queue. The allocated fragment is not returned to the caller.
7272
*/
@@ -82,21 +82,92 @@ append_frag_to_list(opal_list_t *queue, mca_btl_base_module_t *btl,
8282
opal_list_append(queue, (opal_list_item_t*)frag);
8383
}
8484

85+
/**
86+
* Append an unexpected descriptor to an ordered queue. This function will allocate and
87+
* initialize the fragment (if necessary) and then will add it to the specified
88+
* queue respecting the sequence number. The allocated fragment is not returned to the caller.
89+
*/
90+
static void
91+
append_frag_to_ordered_list(opal_list_t* queue, mca_btl_base_module_t *btl,
92+
mca_pml_ob1_match_hdr_t *hdr, mca_btl_base_segment_t* segments,
93+
size_t num_segments, mca_pml_ob1_recv_frag_t* frag)
94+
{
95+
mca_pml_ob1_recv_frag_t* tmpfrag;
96+
mca_pml_ob1_match_hdr_t* tmphdr;
97+
98+
if(NULL == frag) {
99+
MCA_PML_OB1_RECV_FRAG_ALLOC(frag);
100+
MCA_PML_OB1_RECV_FRAG_INIT(frag, hdr, segments, num_segments, btl);
101+
}
102+
103+
if( opal_list_is_empty(queue) ) { /* no pending fragments yet */
104+
opal_list_append(queue, (opal_list_item_t*)frag);
105+
return;
106+
}
107+
/* Shortcut for sequence number earlier than the first fragment in the list */
108+
tmpfrag = (mca_pml_ob1_recv_frag_t*)opal_list_get_first(queue);
109+
tmphdr = &tmpfrag->hdr.hdr_match;
110+
assert(hdr->hdr_seq != tmphdr->hdr_seq);
111+
if( hdr->hdr_seq < tmphdr->hdr_seq ) {
112+
opal_list_prepend(queue, (opal_list_item_t*)frag);
113+
return;
114+
}
115+
/* Shortcut for sequence number later than the last fragment in the list */
116+
tmpfrag = (mca_pml_ob1_recv_frag_t*)opal_list_get_last(queue);
117+
tmphdr = &tmpfrag->hdr.hdr_match;
118+
if( hdr->hdr_seq > tmphdr->hdr_seq ) {
119+
opal_list_append(queue, (opal_list_item_t*)frag);
120+
return;
121+
}
122+
/* For all other cases (sequence number missing in the list) */
123+
OPAL_LIST_FOREACH(tmpfrag, queue, mca_pml_ob1_recv_frag_t) {
124+
tmphdr = &tmpfrag->hdr.hdr_match;
125+
if( hdr->hdr_seq < tmphdr->hdr_seq ) {
126+
opal_list_insert_pos(queue, (opal_list_item_t*)tmpfrag,
127+
(opal_list_item_t*) frag);
128+
return;
129+
}
130+
}
131+
}
132+
85133
/**
86134
* Match incoming recv_frags against posted receives.
87135
* Supports out of order delivery.
88136
*
89-
* @param frag_header (IN) Header of received recv_frag.
90-
* @param frag_desc (IN) Received recv_frag descriptor.
91-
* @param match_made (OUT) Flag indicating wether a match was made.
92-
* @param additional_matches (OUT) List of additional matches
137+
* @param hdr (IN) Header of received recv_frag.
138+
* @param segments (IN) Received recv_frag descriptor.
139+
* @param num_segments (IN) Flag indicating wether a match was made.
140+
* @param type (IN) Type of the message header.
93141
* @return OMPI_SUCCESS or error status on failure.
94142
*/
95143
static int mca_pml_ob1_recv_frag_match( mca_btl_base_module_t *btl,
96144
mca_pml_ob1_match_hdr_t *hdr,
97145
mca_btl_base_segment_t* segments,
98146
size_t num_segments,
99-
int type);
147+
int type );
148+
149+
/**
150+
* Match incoming frags against posted receives. If frag is not NULL then we assume
151+
* it is already local and that it can be released upon completion.
152+
* Supports out of order delivery.
153+
*
154+
* @param comm_ptr (IN) Communicator where the message has been received
155+
* @param proc (IN) Proc for which we have received the message.
156+
* @param hdr (IN) Header of received recv_frag.
157+
* @param segments (IN) Received recv_frag descriptor.
158+
* @param num_segments (IN) Flag indicating wether a match was made.
159+
* @param type (IN) Type of the message header.
160+
* @return OMPI_SUCCESS or error status on failure.
161+
*/
162+
static int
163+
mca_pml_ob1_recv_frag_match_proc( mca_btl_base_module_t *btl,
164+
ompi_communicator_t* comm_ptr,
165+
mca_pml_ob1_comm_proc_t *proc,
166+
mca_pml_ob1_match_hdr_t *hdr,
167+
mca_btl_base_segment_t* segments,
168+
size_t num_segments,
169+
int type,
170+
mca_pml_ob1_recv_frag_t* frag );
100171

101172
static mca_pml_ob1_recv_request_t*
102173
match_one(mca_btl_base_module_t *btl,
@@ -105,6 +176,19 @@ match_one(mca_btl_base_module_t *btl,
105176
mca_pml_ob1_comm_proc_t *proc,
106177
mca_pml_ob1_recv_frag_t* frag);
107178

179+
static inline mca_pml_ob1_recv_frag_t* check_cantmatch_for_match(mca_pml_ob1_comm_proc_t *proc)
180+
{
181+
mca_pml_ob1_recv_frag_t *frag = NULL;
182+
183+
frag = (mca_pml_ob1_recv_frag_t*)opal_list_get_first(&proc->frags_cant_match);
184+
if( (opal_list_get_end(&proc->frags_cant_match) != (opal_list_item_t*)frag) &&
185+
(frag->hdr.hdr_match.hdr_seq == proc->expected_sequence) ) {
186+
opal_list_remove_item(&proc->frags_cant_match, (opal_list_item_t*)frag);
187+
return frag;
188+
}
189+
return NULL;
190+
}
191+
108192
void mca_pml_ob1_recv_frag_callback_match(mca_btl_base_module_t* btl,
109193
mca_btl_base_tag_t tag,
110194
mca_btl_base_descriptor_t* des,
@@ -164,15 +248,16 @@ void mca_pml_ob1_recv_frag_callback_match(mca_btl_base_module_t* btl,
164248
OB1_MATCHING_LOCK(&comm->matching_lock);
165249

166250
if (!OMPI_COMM_CHECK_ASSERT_ALLOW_OVERTAKE(comm_ptr)) {
167-
/* get sequence number of next message that can be processed */
168-
if(OPAL_UNLIKELY((((uint16_t) hdr->hdr_seq) != ((uint16_t) proc->expected_sequence)) ||
169-
(opal_list_get_size(&proc->frags_cant_match) > 0 ))) {
170-
goto slow_path;
171-
}
172-
173-
/* This is the sequence number we were expecting, so we can try
174-
* matching it to already posted receives.
251+
/* get sequence number of next message that can be processed.
252+
* If this frag is out of sequence, queue it up in the list
253+
* now as we still have the lock.
175254
*/
255+
if(OPAL_UNLIKELY(((uint16_t) hdr->hdr_seq) != ((uint16_t) proc->expected_sequence))) {
256+
append_frag_to_ordered_list(&proc->frags_cant_match, btl,
257+
hdr, segments, num_segments, NULL);
258+
OB1_MATCHING_UNLOCK(&comm->matching_lock);
259+
return;
260+
}
176261

177262
/* We're now expecting the next sequence number. */
178263
proc->expected_sequence++;
@@ -183,14 +268,13 @@ void mca_pml_ob1_recv_frag_callback_match(mca_btl_base_module_t* btl,
183268
* generation until we reach the correct sequence number.
184269
*/
185270
PERUSE_TRACE_MSG_EVENT(PERUSE_COMM_SEARCH_POSTED_Q_BEGIN, comm_ptr,
186-
hdr->hdr_src, hdr->hdr_tag, PERUSE_RECV);
271+
hdr->hdr_src, hdr->hdr_tag, PERUSE_RECV);
187272

188273
match = match_one(btl, hdr, segments, num_segments, comm_ptr, proc, NULL);
189274

190275
/* The match is over. We generate the SEARCH_POSTED_Q_END here,
191-
* before going into the mca_pml_ob1_check_cantmatch_for_match so
192-
* we can make a difference for the searching time for all
193-
* messages.
276+
* before going into check_cantmatch_for_match so we can make
277+
* a difference for the searching time for all messages.
194278
*/
195279
PERUSE_TRACE_MSG_EVENT(PERUSE_COMM_SEARCH_POSTED_Q_END, comm_ptr,
196280
hdr->hdr_src, hdr->hdr_tag, PERUSE_RECV);
@@ -246,12 +330,31 @@ void mca_pml_ob1_recv_frag_callback_match(mca_btl_base_module_t* btl,
246330
/* don't need a rmb as that is for checking */
247331
recv_request_pml_complete(match);
248332
}
249-
return;
250333

251-
slow_path:
252-
OB1_MATCHING_UNLOCK(&comm->matching_lock);
253-
mca_pml_ob1_recv_frag_match(btl, hdr, segments,
254-
num_segments, MCA_PML_OB1_HDR_TYPE_MATCH);
334+
/* We matched the frag, Now see if we already have the next sequence in
335+
* our OOS list. If yes, try to match it.
336+
*
337+
* NOTE:
338+
* To optimize the number of lock used, mca_pml_ob1_recv_frag_match_proc()
339+
* MUST be called with communicator lock and will RELEASE the lock. This is
340+
* not ideal but it is better for the performance.
341+
*/
342+
if(0 != opal_list_get_size(&proc->frags_cant_match)) {
343+
mca_pml_ob1_recv_frag_t* frag;
344+
345+
OB1_MATCHING_LOCK(&comm->matching_lock);
346+
if((frag = check_cantmatch_for_match(proc))) {
347+
/* mca_pml_ob1_recv_frag_match_proc() will release the lock. */
348+
mca_pml_ob1_recv_frag_match_proc(frag->btl, comm_ptr, proc,
349+
&frag->hdr.hdr_match,
350+
frag->segments, frag->num_segments,
351+
hdr->hdr_common.hdr_type, frag);
352+
} else {
353+
OB1_MATCHING_UNLOCK(&comm->matching_lock);
354+
}
355+
}
356+
357+
return;
255358
}
256359

257360

@@ -590,31 +693,6 @@ match_one(mca_btl_base_module_t *btl,
590693
} while(true);
591694
}
592695

593-
static mca_pml_ob1_recv_frag_t* check_cantmatch_for_match(mca_pml_ob1_comm_proc_t *proc)
594-
{
595-
mca_pml_ob1_recv_frag_t *frag;
596-
597-
/* search the list for a fragment from the send with sequence
598-
* number next_msg_seq_expected
599-
*/
600-
for(frag = (mca_pml_ob1_recv_frag_t*)opal_list_get_first(&proc->frags_cant_match);
601-
frag != (mca_pml_ob1_recv_frag_t*)opal_list_get_end(&proc->frags_cant_match);
602-
frag = (mca_pml_ob1_recv_frag_t*)opal_list_get_next(frag))
603-
{
604-
mca_pml_ob1_match_hdr_t* hdr = &frag->hdr.hdr_match;
605-
/*
606-
* If the message has the next expected seq from that proc...
607-
*/
608-
if(hdr->hdr_seq != proc->expected_sequence)
609-
continue;
610-
611-
opal_list_remove_item(&proc->frags_cant_match, (opal_list_item_t*)frag);
612-
return frag;
613-
}
614-
615-
return NULL;
616-
}
617-
618696
/**
619697
* RCS/CTS receive side matching
620698
*
@@ -652,12 +730,11 @@ static int mca_pml_ob1_recv_frag_match( mca_btl_base_module_t *btl,
652730
int type)
653731
{
654732
/* local variables */
655-
uint16_t next_msg_seq_expected, frag_msg_seq;
733+
uint16_t frag_msg_seq;
734+
uint16_t next_msg_seq_expected;
656735
ompi_communicator_t *comm_ptr;
657-
mca_pml_ob1_recv_request_t *match = NULL;
658736
mca_pml_ob1_comm_t *comm;
659737
mca_pml_ob1_comm_proc_t *proc;
660-
mca_pml_ob1_recv_frag_t* frag = NULL;
661738

662739
/* communicator pointer */
663740
comm_ptr = ompi_comm_lookup(hdr->hdr_ctx);
@@ -676,14 +753,13 @@ static int mca_pml_ob1_recv_frag_match( mca_btl_base_module_t *btl,
676753
comm = (mca_pml_ob1_comm_t *)comm_ptr->c_pml_comm;
677754

678755
/* source sequence number */
679-
frag_msg_seq = hdr->hdr_seq;
680756
proc = mca_pml_ob1_peer_lookup (comm_ptr, hdr->hdr_src);
681757

682-
/**
683-
* We generate the MSG_ARRIVED event as soon as the PML is aware of a matching
684-
* fragment arrival. Independing if it is received on the correct order or not.
685-
* This will allow the tools to figure out if the messages are not received in the
686-
* correct order (if multiple network interfaces).
758+
/* We generate the MSG_ARRIVED event as soon as the PML is aware
759+
* of a matching fragment arrival. Independing if it is received
760+
* on the correct order or not. This will allow the tools to
761+
* figure out if the messages are not received in the correct
762+
* order (if multiple network interfaces).
687763
*/
688764
PERUSE_TRACE_MSG_EVENT(PERUSE_COMM_MSG_ARRIVED, comm_ptr,
689765
hdr->hdr_src, hdr->hdr_tag, PERUSE_RECV);
@@ -697,38 +773,67 @@ static int mca_pml_ob1_recv_frag_match( mca_btl_base_module_t *btl,
697773
*/
698774
OB1_MATCHING_LOCK(&comm->matching_lock);
699775

700-
/* get sequence number of next message that can be processed */
776+
frag_msg_seq = hdr->hdr_seq;
701777
next_msg_seq_expected = (uint16_t)proc->expected_sequence;
702-
if(OPAL_UNLIKELY(frag_msg_seq != next_msg_seq_expected))
703-
goto wrong_seq;
704778

705-
/*
706-
* This is the sequence number we were expecting,
707-
* so we can try matching it to already posted
708-
* receives.
779+
/* If the sequence number is wrong, queue it up for later. */
780+
if(OPAL_UNLIKELY(frag_msg_seq != next_msg_seq_expected)) {
781+
append_frag_to_ordered_list(&proc->frags_cant_match, btl, hdr, segments,
782+
num_segments, NULL);
783+
OB1_MATCHING_UNLOCK(&comm->matching_lock);
784+
return OMPI_SUCCESS;
785+
}
786+
787+
/* mca_pml_ob1_recv_frag_match_proc() will release the lock. */
788+
return mca_pml_ob1_recv_frag_match_proc(btl, comm_ptr, proc, hdr,
789+
segments, num_segments,
790+
type, NULL);
791+
}
792+
793+
794+
/* mca_pml_ob1_recv_frag_match_proc() will match the given frag and
795+
* then try to match the next frag in sequence by looking into arrived
796+
* out of order frags in frags_cant_match list until it can't find one.
797+
*
798+
* ATTENTION: THIS FUNCTION MUST BE CALLED WITH COMMUNICATOR LOCK HELD.
799+
* THE LOCK WILL BE RELEASED UPON RETURN. USE WITH CARE. */
800+
static int
801+
mca_pml_ob1_recv_frag_match_proc( mca_btl_base_module_t *btl,
802+
ompi_communicator_t* comm_ptr,
803+
mca_pml_ob1_comm_proc_t *proc,
804+
mca_pml_ob1_match_hdr_t *hdr,
805+
mca_btl_base_segment_t* segments,
806+
size_t num_segments,
807+
int type,
808+
mca_pml_ob1_recv_frag_t* frag )
809+
{
810+
/* local variables */
811+
mca_pml_ob1_comm_t* comm = (mca_pml_ob1_comm_t *)comm_ptr->c_pml_comm;
812+
mca_pml_ob1_recv_request_t *match = NULL;
813+
814+
/* If we are here, this is the sequence number we were expecting,
815+
* so we can try matching it to already posted receives.
709816
*/
710817

711-
out_of_order_match:
818+
match_this_frag:
712819
/* We're now expecting the next sequence number. */
713820
proc->expected_sequence++;
714821

715-
/**
716-
* We generate the SEARCH_POSTED_QUEUE only when the message is received
717-
* in the correct sequence. Otherwise, we delay the event generation until
718-
* we reach the correct sequence number.
822+
/* We generate the SEARCH_POSTED_QUEUE only when the message is
823+
* received in the correct sequence. Otherwise, we delay the event
824+
* generation until we reach the correct sequence number.
719825
*/
720826
PERUSE_TRACE_MSG_EVENT(PERUSE_COMM_SEARCH_POSTED_Q_BEGIN, comm_ptr,
721-
hdr->hdr_src, hdr->hdr_tag, PERUSE_RECV);
827+
hdr->hdr_src, hdr->hdr_tag, PERUSE_RECV);
722828

723829
match = match_one(btl, hdr, segments, num_segments, comm_ptr, proc, frag);
724830

725-
/**
726-
* The match is over. We generate the SEARCH_POSTED_Q_END here, before going
727-
* into the mca_pml_ob1_check_cantmatch_for_match so we can make a difference
728-
* for the searching time for all messages.
831+
/* The match is over. We generate the SEARCH_POSTED_Q_END here,
832+
* before going into check_cantmatch_for_match we can make a
833+
* difference for the searching time for all messages.
729834
*/
730835
PERUSE_TRACE_MSG_EVENT(PERUSE_COMM_SEARCH_POSTED_Q_END, comm_ptr,
731-
hdr->hdr_src, hdr->hdr_tag, PERUSE_RECV);
836+
hdr->hdr_src, hdr->hdr_tag, PERUSE_RECV);
732837

733838
/* release matching lock before processing fragment */
734839
OB1_MATCHING_UNLOCK(&comm->matching_lock);
@@ -752,7 +857,7 @@ static int mca_pml_ob1_recv_frag_match( mca_btl_base_module_t *btl,
752857

753858
/*
754859
* Now that new message has arrived, check to see if
755-
* any fragments on the c_c_frags_cant_match list
860+
* any fragments on the frags_cant_match list
756861
* may now be used to form new matchs
757862
*/
758863
if(OPAL_UNLIKELY(opal_list_get_size(&proc->frags_cant_match) > 0)) {
@@ -763,20 +868,11 @@ static int mca_pml_ob1_recv_frag_match( mca_btl_base_module_t *btl,
763868
num_segments = frag->num_segments;
764869
btl = frag->btl;
765870
type = hdr->hdr_common.hdr_type;
766-
goto out_of_order_match;
871+
goto match_this_frag;
767872
}
768873
OB1_MATCHING_UNLOCK(&comm->matching_lock);
769874
}
770875

771-
return OMPI_SUCCESS;
772-
wrong_seq:
773-
/*
774-
* This message comes after the next expected, so it
775-
* is ahead of sequence. Save it for later.
776-
*/
777-
append_frag_to_list(&proc->frags_cant_match, btl, hdr, segments,
778-
num_segments, NULL);
779-
OB1_MATCHING_UNLOCK(&comm->matching_lock);
780876
return OMPI_SUCCESS;
781877
}
782878

0 commit comments

Comments
 (0)