Pārlūkot izejas kodu

- tm 6xx fixes: when a 6xx is received send a CANCEL on all the branches for which a provisional response was received and then wait for the branches to
finish (timeout, 487 from the CANCEL or another final response which
"raced" the CANCEL). Branches that did not receive any response
(in fact any response >=100) will be "terminated" immediately (fake 487).
- tm which_cancel() 0-the cancel bitmap fix
- tm which_cancel(), should_cancel(), cancel_branch() enhancements/changes
- tm rpc_cancel fixes (it works now)

Andrei Pelinescu-Onciul 19 gadi atpakaļ
vecāks
revīzija
65e998d19c
8 mainītis faili ar 190 papildinājumiem un 55 dzēšanām
  1. 1 1
      Makefile.defs
  2. 6 1
      NEWS
  3. 3 1
      modules/tm/h_table.h
  4. 104 20
      modules/tm/t_cancel.c
  5. 24 6
      modules/tm/t_cancel.h
  6. 14 11
      modules/tm/t_fwd.c
  7. 37 12
      modules/tm/t_reply.c
  8. 1 3
      modules/tm/timer.c

+ 1 - 1
Makefile.defs

@@ -67,7 +67,7 @@ MAIN_NAME=ser
 VERSION = 0
 PATCHLEVEL = 10
 SUBLEVEL =   99
-EXTRAVERSION = -dev49-dns_cache
+EXTRAVERSION = -dev50-dns_cache
 
 SER_VER = $(shell expr $(VERSION) \* 1000000 + $(PATCHLEVEL) \* 1000 + \
 			$(SUBLEVEL) )

+ 6 - 1
NEWS

@@ -24,7 +24,12 @@ modules:
                 hashing after an uri (to, from or request uri)
               - improved uri hashing (password is ignored, port is used only
                 if != 5060 or 5061)
- - tm        - better final reply selection: 6xx is preferred over other 
+ - tm        - on 6xx immediately cancel all the branches for which a 
+               provisional response was received and wait for all the 
+               branches to finish (either timeout, the 487 from the CANCEL
+               or a final response still on the wire in the moment the
+               CANCEL was sent)
+             - better final reply selection: 6xx is preferred over other 
                negative replies; from several 4xx prefer 401, 407, 415, 420,
                484 (in this order). For all the other cases, return the lowest
                code (as before)

+ 3 - 1
modules/tm/h_table.h

@@ -190,8 +190,10 @@ struct totag_elem {
 #define T_NOISY_CTIMER_FLAG  (1<<2)
 /* transaction canceled */
 #define T_CANCELED           (1<<3)
+/* 6xx received => stop forking */
+#define T_6xx                (1<<4) 
 
-#define T_IN_AGONY (1<<4) /* set if waiting to die (delete timer)
+#define T_IN_AGONY (1<<5) /* set if waiting to die (delete timer)
                              TODO: replace it with del on unref */
 
 /* transaction context */

+ 104 - 20
modules/tm/t_cancel.c

@@ -31,6 +31,7 @@
  *             moved here (jiri)
  * 2004-02-11  FIFO/CANCEL + alignments (hash=f(callid,cseq)) (uli+jiri)
  * 2004-02-13  timer_link.payload removed (bogdan)
+ * 2006-10-10  cancel_uacs  & cancel_branch take more options now (andrei)
  */
 
 #include <stdio.h> /* for FILE* in fifo_uac_cancel */
@@ -53,13 +54,12 @@
    by two processes might concurrently try to generate
    a CANCEL for the third branch, resulting in race conditions
    during writing to cancel buffer
-*/
-
-
+ WARNING: - has side effects, see should_cancel_branch() */
 void which_cancel( struct cell *t, branch_bm_t *cancel_bm )
 {
 	int i;
-
+	
+	*cancel_bm=0;
 	for( i=0 ; i<t->nr_of_outgoings ; i++ ) {
 		if (should_cancel_branch(t, i)) 
 			*cancel_bm |= 1<<i ;
@@ -68,25 +68,67 @@ void which_cancel( struct cell *t, branch_bm_t *cancel_bm )
 }
 
 
-/* cancel branches scheduled for deletion */
-void cancel_uacs( struct cell *t, branch_bm_t cancel_bm )
+
+
+/* cancel branches scheduled for deletion
+ * params: t          - transaction
+ *          cancel_bm - bitmap with the branches that are supposed to be 
+ *                       canceled 
+ *          flags     - how_to_cancel flags, see cancel_branch()
+ * returns: bitmap with the still active branches (on fr timer) */
+int cancel_uacs( struct cell *t, branch_bm_t cancel_bm, int flags)
 {
 	int i;
+	int ret;
+	int r;
 
+	ret=0;
 	/* cancel pending client transactions, if any */
 	for( i=0 ; i<t->nr_of_outgoings ; i++ ) 
-		if (cancel_bm & (1<<i))
-           	cancel_branch(t, i);
+		if (cancel_bm & (1<<i)){
+			r=cancel_branch(t, i, flags);
+			ret|=(r!=0)<<i;
+		}
+	return ret;
 }
 
-void cancel_branch( struct cell *t, int branch )
+
+
+/* 
+ * params:  t - transaction
+ *          branch - branch number to be canceled
+ *          flags - howto cancel: 
+ *                   F_CANCEL_B_KILL - will completely stop the 
+ *                     branch (stops the timers), use with care
+ *                   F_CANCEL_B_FAKE_REPLY - will send a fake 487
+ *                      to all branches that haven't received any response
+ *                      (>=100). It assumes the REPLY_LOCK is not held
+ *                      (if it is => deadlock)
+ *                  default: stop only the retransmissions for the branch
+ *                      and leave it to timeout if it doesn't receive any
+ *                      response to the CANCEL
+ * returns: 0 - branch inactive after running cancel_branch() 
+ *          1 - branch still active  (fr_timer)
+ *         -1 - error
+ * WARNING:
+ *          - F_CANCEL_KILL_B should be used only if the transaction is killed
+ *            explicitely afterwards (since it might kill all the timers
+ *            the transaction won't be able to "kill" itself => if not
+ *            explicitely "put_on_wait" it migh leave forever)
+ *          - F_CANCEL_B_FAKE_REPLY must be used only if the REPLY_LOCK is not
+ *            held
+ */
+int cancel_branch( struct cell *t, int branch, int flags )
 {
 	char *cancel;
 	unsigned int len;
 	struct retr_buf *crb, *irb;
+	int ret;
+	branch_bm_t tmp_bm;
 
 	crb=&t->uac[branch].local_cancel;
 	irb=&t->uac[branch].request;
+	ret=1;
 
 #	ifdef EXTRA_DEBUG
 	if (crb->buffer!=0 && crb->buffer!=BUSY_BUFFER) {
@@ -95,25 +137,52 @@ void cancel_branch( struct cell *t, int branch )
 	}
 #	endif
 
-	stop_rb_timers( irb );
-
-	if (t->uac[branch].last_received < 100) {
-		DBG("DEBUG: cancel_branch: no response ever received: "
-		    "giving up on cancel\n");
-		return;
+	if (flags & F_CANCEL_B_KILL){
+		stop_rb_timers( irb );
+		ret=0;
+		if (t->uac[branch].last_received < 100) {
+			DBG("DEBUG: cancel_branch: no response ever received: "
+			    "giving up on cancel\n");
+			if (flags & F_CANCEL_B_FAKE_REPLY){
+				LOCK_REPLIES(t);
+				if (relay_reply(t, FAKED_REPLY, branch, 487, &tmp_bm) == 
+										RPS_ERROR){
+					return -1;
+				}
+			}
+			/* do nothing, hope that the caller will clean up */
+			return ret;
+		}
+	}else{
+		stop_rb_retr(irb); /* stop retransmissions */
+		if (t->uac[branch].last_received < 100) {
+			/* no response received => don't send a cancel on this branch,
+			 *  just drop it */
+			if (flags & F_CANCEL_B_FAKE_REPLY){
+				stop_rb_timers( irb ); /* stop even the fr timer */
+				LOCK_REPLIES(t);
+				if (relay_reply(t, FAKED_REPLY, branch, 487, &tmp_bm) == 
+										RPS_ERROR){
+					return -1;
+				}
+				return 0; /* should be inactive after the 487 */
+			}
+			/* do nothing, just wait for the final timeout */
+			return 1;
+		}
 	}
 	
 	cancel = build_local(t, branch, &len, CANCEL, CANCEL_LEN, &t->to);
 	if (!cancel) {
 		LOG(L_ERR, "ERROR: attempt to build a CANCEL failed\n");
-		return;
+		return -1;
 	}
 	/* install cancel now */
 	crb->buffer = cancel;
 	crb->buffer_len = len;
 	crb->dst = irb->dst;
 	crb->branch = branch;
-	/* label it as cancel so that FR timer can better now how to
+	/* label it as cancel so that FR timer can better know how to
 	   deal with it */
 	crb->activ_type = TYPE_LOCAL_CANCEL;
 
@@ -123,6 +192,7 @@ void cancel_branch( struct cell *t, int branch )
 	if (start_retr( crb )!=0)
 		LOG(L_CRIT, "BUG: cancel_branch: failed to start retransmission"
 					" for %p\n", crb);
+	return ret;
 }
 
 
@@ -143,14 +213,17 @@ void rpc_cancel(rpc_t* rpc, void* c)
 {
 	struct cell *trans;
 	static char cseq[128], callid[128];
+	branch_bm_t cancel_bm;
+	int i,j;
 
 	str cseq_s;   /* cseq */
 	str callid_s; /* callid */
 
 	cseq_s.s=cseq;
 	callid_s.s=callid;
+	cancel_bm=0;
 
-	if (rpc->scan(rpc, "SS", &callid_s, &cseq_s) < 2) {
+	if (rpc->scan(c, "SS", &callid_s, &cseq_s) < 2) {
 		rpc->fault(c, 400, "Callid and CSeq expected as parameters");
 		return;
 	}
@@ -160,10 +233,21 @@ void rpc_cancel(rpc_t* rpc, void* c)
 		rpc->fault(c, 400, "Transaction not found");
 		return;
 	}
-	     /* tell tm to cancel the call */
+	/*  find the branches that need cancel-ing */
+	LOCK_REPLIES(trans);
+		which_cancel(trans, &cancel_bm);
+	UNLOCK_REPLIES(trans);
+	 /* tell tm to cancel the call */
 	DBG("Now calling cancel_uacs\n");
-	(*cancel_uacs)(trans, ~0);
+	i=cancel_uacs(trans, cancel_bm, 0); /* don't fake 487s, 
+										 just wait for timeout */
 	
 	     /* t_lookup_callid REF`d the transaction for us, we must UNREF here! */
 	UNREF(trans);
+	j=0;
+	while(i){
+		j++;
+		i&=i-1;
+	}
+	rpc->add(c, "ds", j, "branches remaining (waiting for timeout)");
 }

+ 24 - 6
modules/tm/t_cancel.h

@@ -28,6 +28,8 @@
  * History:
  * ---------
  *  2004-02-11  FIFO/CANCEL + alignments (hash=f(callid,cseq)) (uli+jiri)
+ *  2006-10-10  should_cancel_branch() returns true even for branches with
+ *               no response or with response <100 (andrei)
  */
 
 
@@ -51,20 +53,36 @@
 
 #define BUSY_BUFFER ((char *)-1)
 
+/* flags for cancel_uacs(), cancel_branch() */
+#define F_CANCEL_B_KILL 1  /*  will completely stop the  branch (stops the
+							   timers), use only before a put_on_wait()
+							   or set_final_timer()*/
+#define F_CANCEL_B_FAKE_REPLY 2 /* will send a fake 487 to all branches that
+								 haven't received any response (>=100). It
+								 assumes the REPLY_LOCK is not held (if it is
+								 => deadlock) */
+
+
 void which_cancel( struct cell *t, branch_bm_t *cancel_bm );
-void cancel_uacs( struct cell *t, branch_bm_t cancel_bm );
-void cancel_branch( struct cell *t, int branch );
+int cancel_uacs( struct cell *t, branch_bm_t cancel_bm, int flags );
+int cancel_branch( struct cell *t, int branch, int flags );
+
+
 
+/* should be called either with the REPLY_LOCK held or if its known
+ *  by other means that no other cancel trigerring reply can appear
+ *  between this call and the call to cancel_uacs()/cancel_branch()
+ *  WARNING: has side effects: marks branches that should be canceled
+ *   and a second call won't return them again */
 inline short static should_cancel_branch( struct cell *t, int b )
 {
 	int last_received;
 	short should;
 
 	last_received=t->uac[b].last_received;
-	/* cancel only if provisional received and noone else
-	   attempted to cancel yet */
-	should=last_received>=100 && last_received<200
-		&& t->uac[b].local_cancel.buffer==0;
+	/* cancel even if no reply received (in this case cancel_branch()
+	 *  won't actually send the cancel but it will do the cleanup) */
+	should=last_received<200 && t->uac[b].local_cancel.buffer==0;
 	/* we'll cancel -- label it so that noone else
 		(e.g. another 200 branch) will try to do the same */
 	if (should) t->uac[b].local_cancel.buffer=BUSY_BUFFER;

+ 14 - 11
modules/tm/t_fwd.c

@@ -56,6 +56,8 @@
  *               if error is returned) (andrei)
  *  2006-09-15  e2e_cancel uses t_reply_unsafe when called from the 
  *               failure_route and replying to a cancel (andrei)
+ *  2006-10-10  e2e_cancel update for the new/modified 
+ *               which_cancel()/should_cancel() (andrei)
  */
 
 #include "defs.h"
@@ -477,7 +479,7 @@ void e2e_cancel( struct sip_msg *cancel_msg,
 	t_cancel->label=t_invite->label;
 	/* ... and install CANCEL UACs */
 	for (i=0; i<t_invite->nr_of_outgoings; i++)
-		if (cancel_bm & (1<<i)) {
+		if ((cancel_bm & (1<<i)) && (t_invite->uac[i].last_received>=100)) {
 			ret=e2e_cancel_branch(cancel_msg, t_cancel, t_invite, i);
 			if (ret<0) cancel_bm &= ~(1<<i);
 			if (ret<lowest_error) lowest_error=ret;
@@ -489,16 +491,17 @@ void e2e_cancel( struct sip_msg *cancel_msg,
 	/* send them out */
 	for (i = 0; i < t_cancel->nr_of_outgoings; i++) {
 		if (cancel_bm & (1 << i)) {
-			     /* Provisional reply received on this branch, send CANCEL */
-			     /* No need to stop timers as they have already been stopped by the reply */
-			if (SEND_BUFFER(&t_cancel->uac[i].request) == -1) {
-				LOG(L_ERR, "ERROR: e2e_cancel: send failed\n");
-			}
-			if (start_retr( &t_cancel->uac[i].request )!=0)
-				LOG(L_CRIT, "BUG: e2e_cancel: failed to start retr. for %p\n",
-							&t_cancel->uac[i].request);
-		} else {
-			if (t_invite->uac[i].last_received < 100) {
+			if (t_invite->uac[i].last_received>=100){
+				/* Provisional reply received on this branch, send CANCEL */
+				/* No need to stop timers as they have already been stopped 
+				 * by the reply */
+				if (SEND_BUFFER(&t_cancel->uac[i].request) == -1) {
+					LOG(L_ERR, "ERROR: e2e_cancel: send failed\n");
+				}
+				if (start_retr( &t_cancel->uac[i].request )!=0)
+					LOG(L_CRIT, "BUG: e2e_cancel: failed to start retr."
+							" for %p\n", &t_cancel->uac[i].request);
+			} else {
 				/* No provisional response received, stop
 				 * retransmission timers */
 				stop_rb_retr(&t_invite->uac[i].request);

+ 37 - 12
modules/tm/t_reply.c

@@ -406,7 +406,6 @@ static int _reply_light( struct cell *trans, char* buf, unsigned int len,
 		goto error2;
 	}
 
-
 	rb = & trans->uas.response;
 	rb->activ_type=code;
 
@@ -445,7 +444,8 @@ static int _reply_light( struct cell *trans, char* buf, unsigned int len,
 		}
 
 		cleanup_uac_timers( trans );
-		if (is_invite(trans)) cancel_uacs( trans, cancel_bitmap );
+		if (is_invite(trans)) 
+			cancel_uacs( trans, cancel_bitmap, F_CANCEL_B_KILL );
 		set_final_timer(  trans );
 	}
 
@@ -478,7 +478,8 @@ error2:
 error:
 	/* do UAC cleanup */
 	cleanup_uac_timers( trans );
-	if ( is_invite(trans) ) cancel_uacs( trans, cancel_bitmap );
+	if ( is_invite(trans) )
+		cancel_uacs( trans, cancel_bitmap, F_CANCEL_B_KILL);
 	/* we did not succeed -- put the transaction on wait */
 	put_on_wait(trans);
 	return -1;
@@ -859,11 +860,13 @@ static enum rps t_should_relay_response( struct cell *Trans , int new_code,
 			DBG("DEBUG: final reply retransmission\n");
 			goto discard;
 		}
-		/* if you FR-timed-out, faked a local 408 or 480 and 487 came, don't
+		/* if you FR-timed-out, faked a local 408  and 487 came or
+		 *  faked a CANCEL on a non-replied branch don't
 		 * report on it either */
-		if ((Trans->uac[branch].last_received==408 ||
-				Trans->uac[branch].last_received==480) && new_code==487) {
-			DBG("DEBUG: 487 came for a timed-out branch\n");
+		if ((Trans->uac[branch].last_received==487) || 
+				(Trans->uac[branch].last_received==408 && new_code==487)) {
+			DBG("DEBUG: %d came for a %d branch (ignored)\n",
+					new_code, Trans->uac[branch].last_received);
 			goto discard;
 		}
 		/* this looks however how a very strange status rewrite attempt;
@@ -886,6 +889,13 @@ static enum rps t_should_relay_response( struct cell *Trans , int new_code,
 		if (picked_branch==-2) { /* branches open yet */
 			*should_store=1;
 			*should_relay=-1;
+			if (new_code>=600 && new_code<=699){
+				if (!(Trans->flags & T_6xx)){
+					/* cancel only the first time we get a 6xx */
+					which_cancel(Trans, cancel_bitmap);
+					Trans->flags|=T_6xx;
+				}
+			}
 			return RPS_STORE;
 		}
 		if (picked_branch==-1) {
@@ -901,6 +911,8 @@ static enum rps t_should_relay_response( struct cell *Trans , int new_code,
 		 * make it available in failure routes - a kind of "fake"
 		 * save of the final reply per branch */
 		Trans->uac[branch].reply = reply;
+		Trans->flags&=~T_6xx; /* clear the 6xx flag , we want to 
+								 allow new branches from the failure route */
 
 		/* run ON_FAILURE handlers ( route and callbacks) */
 		if ( has_tran_tmcbs( Trans, TMCB_ON_FAILURE_RO|TMCB_ON_FAILURE)
@@ -1282,7 +1294,7 @@ error02:
 error01:
 	t_reply_unsafe( t, t->uas.request, 500, "Reply processing error" );
 	UNLOCK_REPLIES(t);
-	if (is_invite(t)) cancel_uacs( t, *cancel_bitmap );
+	if (is_invite(t)) cancel_uacs( t, *cancel_bitmap, 0);
 	/* a serious error occurred -- attempt to send an error reply;
 	   it will take care of clean-ups  */
 
@@ -1364,7 +1376,7 @@ error:
 	cleanup_uac_timers(t);
 	if ( get_cseq(p_msg)->method.len==INVITE_LEN
 		&& memcmp( get_cseq(p_msg)->method.s, INVITE, INVITE_LEN)==0)
-		cancel_uacs( t, *cancel_bitmap );
+		cancel_uacs( t, *cancel_bitmap, F_CANCEL_B_KILL);
 	put_on_wait(t);
 	return RPS_ERROR;
 }
@@ -1493,10 +1505,15 @@ int reply_received( struct sip_msg  *p_msg )
 			      * be still pending branches ...
 			      */
 			cleanup_uac_timers( t );
-			if (is_invite(t)) cancel_uacs( t, cancel_bitmap );
+			if (is_invite(t)) cancel_uacs(t, cancel_bitmap, F_CANCEL_B_KILL);
 			/* There is no need to call set_final_timer because we know
->--->--->--- * that the transaction is local */
+			 * that the transaction is local */
 			put_on_wait(t);
+		}else if (cancel_bitmap){
+			/* cancel everything, even non-INVITEs (e.g in case of 6xx), wait
+			 * for the cancel replies if a provisional response was received
+			 *  or generate a fake reply if not */
+			cancel_uacs(t, cancel_bitmap, F_CANCEL_B_FAKE_REPLY);
 		}
 	} else {
 		reply_status=relay_reply( t, p_msg, branch, msg_status,
@@ -1506,12 +1523,20 @@ int reply_received( struct sip_msg  *p_msg )
 				be still pending branches ...
 			     */
 			cleanup_uac_timers( t );
-			if (is_invite(t)) cancel_uacs( t, cancel_bitmap );
+			/* 2xx is a special case: we can have a COMPLETED request
+			 * with branches still open => we have to cancel them */
+			if (is_invite(t) && cancel_bitmap) 
+				cancel_uacs( t, cancel_bitmap,  F_CANCEL_B_KILL);
 			/* FR for negative INVITES, WAIT anything else */
 			/* Call to set_final_timer is embedded in relay_reply to avoid
 			 * race conditions when reply is sent out and an ACK to stop
 			 * retransmissions comes before retransmission timer is set.*/
 			/* set_final_timer(t) */
+		}else if (cancel_bitmap){
+			/* cancel everything, even non-INVITEs (e.g in case of 6xx), wait
+			 * for the cancel replies if a provisional response was received
+			 *  or generate a fake reply if not */
+			cancel_uacs(t, cancel_bitmap, F_CANCEL_B_FAKE_REPLY);
 		}
 	}
 	uac->request.flags|=F_RB_REPLIED;

+ 1 - 3
modules/tm/timer.c

@@ -243,8 +243,6 @@ static void fake_reply(struct cell *t, int branch, int code )
 	enum rps reply_status;
 
 	do_cancel_branch = is_invite(t) && should_cancel_branch(t, branch);
-
-	cancel_bitmap=do_cancel_branch ? 1<<branch : 0;
 	if ( is_local(t) ) {
 		reply_status=local_reply( t, FAKED_REPLY, branch, 
 					  code, &cancel_bitmap );
@@ -276,7 +274,7 @@ static void fake_reply(struct cell *t, int branch, int code )
 
 	}
 	/* now when out-of-lock do the cancel I/O */
-	if (do_cancel_branch) cancel_branch(t, branch );
+	if (do_cancel_branch) cancel_branch(t, branch, 0);
 	/* it's cleaned up on error; if no error occurred and transaction
 	   completed regularly, I have to clean-up myself
 	*/