From 5eccdf5e06eb67779716ae26142402a1ae9b012c Mon Sep 17 00:00:00 2001 From: stephen hemminger Date: Mon, 21 Nov 2011 06:53:46 +0000 Subject: tc: comment spelling fixes Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- include/linux/pkt_sched.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux/pkt_sched.h') diff --git a/include/linux/pkt_sched.h b/include/linux/pkt_sched.h index c5336705921f..7281d5acf2f9 100644 --- a/include/linux/pkt_sched.h +++ b/include/linux/pkt_sched.h @@ -30,7 +30,7 @@ */ struct tc_stats { - __u64 bytes; /* NUmber of enqueues bytes */ + __u64 bytes; /* Number of enqueued bytes */ __u32 packets; /* Number of enqueued packets */ __u32 drops; /* Packets dropped because of lack of resources */ __u32 overlimits; /* Number of throttle events when this @@ -297,7 +297,7 @@ struct tc_htb_glob { __u32 debug; /* debug flags */ /* stats */ - __u32 direct_pkts; /* count of non shapped packets */ + __u32 direct_pkts; /* count of non shaped packets */ }; enum { TCA_HTB_UNSPEC, @@ -503,7 +503,7 @@ enum { }; #define NETEM_LOSS_MAX (__NETEM_LOSS_MAX - 1) -/* State transition probablities for 4 state model */ +/* State transition probabilities for 4 state model */ struct tc_netem_gimodel { __u32 p13; __u32 p31; -- cgit v1.2.3 From 7bc0f28c7a0cd19f40e5a6e4d0a117db9a4e4cd5 Mon Sep 17 00:00:00 2001 From: Hagen Paul Pfeifer Date: Wed, 30 Nov 2011 12:20:26 +0000 Subject: netem: rate extension Currently netem is not in the ability to emulate channel bandwidth. Only static delay (and optional random jitter) can be configured. To emulate the channel rate the token bucket filter (sch_tbf) can be used. But TBF has some major emulation flaws. The buffer (token bucket depth/rate) cannot be 0. Also the idea behind TBF is that the credit (token in buckets) fills if no packet is transmitted. So that there is always a "positive" credit for new packets. In real life this behavior contradicts the law of nature where nothing can travel faster as speed of light. E.g.: on an emulated 1000 byte/s link a small IPv4/TCP SYN packet with ~50 byte require ~0.05 seconds - not 0 seconds. Netem is an excellent place to implement a rate limiting feature: static delay is already implemented, tfifo already has time information and the user can skip TBF configuration completely. This patch implement rate feature which can be configured via tc. e.g: tc qdisc add dev eth0 root netem rate 10kbit To emulate a link of 5000byte/s and add an additional static delay of 10ms: tc qdisc add dev eth0 root netem delay 10ms rate 5KBps Note: similar to TBF the rate extension is bounded to the kernel timing system. Depending on the architecture timer granularity, higher rates (e.g. 10mbit/s and higher) tend to transmission bursts. Also note: further queues living in network adaptors; see ethtool(8). Signed-off-by: Hagen Paul Pfeifer Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/pkt_sched.h | 5 +++++ net/sched/sch_netem.c | 40 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 45 insertions(+) (limited to 'include/linux/pkt_sched.h') diff --git a/include/linux/pkt_sched.h b/include/linux/pkt_sched.h index 7281d5acf2f9..fb556dc594d3 100644 --- a/include/linux/pkt_sched.h +++ b/include/linux/pkt_sched.h @@ -465,6 +465,7 @@ enum { TCA_NETEM_REORDER, TCA_NETEM_CORRUPT, TCA_NETEM_LOSS, + TCA_NETEM_RATE, __TCA_NETEM_MAX, }; @@ -495,6 +496,10 @@ struct tc_netem_corrupt { __u32 correlation; }; +struct tc_netem_rate { + __u32 rate; /* byte/s */ +}; + enum { NETEM_LOSS_UNSPEC, NETEM_LOSS_GI, /* General Intuitive - 4 state model */ diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index eb3b9a86c6ed..9b7af9f1272f 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -79,6 +79,7 @@ struct netem_sched_data { u32 duplicate; u32 reorder; u32 corrupt; + u32 rate; struct crndstate { u32 last; @@ -298,6 +299,11 @@ static psched_tdiff_t tabledist(psched_tdiff_t mu, psched_tdiff_t sigma, return x / NETEM_DIST_SCALE + (sigma / NETEM_DIST_SCALE) * t + mu; } +static psched_time_t packet_len_2_sched_time(unsigned int len, u32 rate) +{ + return PSCHED_NS2TICKS((u64)len * NSEC_PER_SEC / rate); +} + /* * Insert one skb into qdisc. * Note: parent depends on return value to account for queue length. @@ -371,6 +377,24 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch) &q->delay_cor, q->delay_dist); now = psched_get_time(); + + if (q->rate) { + struct sk_buff_head *list = &q->qdisc->q; + + delay += packet_len_2_sched_time(skb->len, q->rate); + + if (!skb_queue_empty(list)) { + /* + * Last packet in queue is reference point (now). + * First packet in queue is already in flight, + * calculate this time bonus and substract + * from delay. + */ + delay -= now - netem_skb_cb(skb_peek(list))->time_to_send; + now = netem_skb_cb(skb_peek_tail(list))->time_to_send; + } + } + cb->time_to_send = now + delay; ++q->counter; ret = qdisc_enqueue(skb, q->qdisc); @@ -535,6 +559,14 @@ static void get_corrupt(struct Qdisc *sch, const struct nlattr *attr) init_crandom(&q->corrupt_cor, r->correlation); } +static void get_rate(struct Qdisc *sch, const struct nlattr *attr) +{ + struct netem_sched_data *q = qdisc_priv(sch); + const struct tc_netem_rate *r = nla_data(attr); + + q->rate = r->rate; +} + static int get_loss_clg(struct Qdisc *sch, const struct nlattr *attr) { struct netem_sched_data *q = qdisc_priv(sch); @@ -594,6 +626,7 @@ static const struct nla_policy netem_policy[TCA_NETEM_MAX + 1] = { [TCA_NETEM_CORR] = { .len = sizeof(struct tc_netem_corr) }, [TCA_NETEM_REORDER] = { .len = sizeof(struct tc_netem_reorder) }, [TCA_NETEM_CORRUPT] = { .len = sizeof(struct tc_netem_corrupt) }, + [TCA_NETEM_RATE] = { .len = sizeof(struct tc_netem_rate) }, [TCA_NETEM_LOSS] = { .type = NLA_NESTED }, }; @@ -666,6 +699,9 @@ static int netem_change(struct Qdisc *sch, struct nlattr *opt) if (tb[TCA_NETEM_CORRUPT]) get_corrupt(sch, tb[TCA_NETEM_CORRUPT]); + if (tb[TCA_NETEM_RATE]) + get_rate(sch, tb[TCA_NETEM_RATE]); + q->loss_model = CLG_RANDOM; if (tb[TCA_NETEM_LOSS]) ret = get_loss_clg(sch, tb[TCA_NETEM_LOSS]); @@ -846,6 +882,7 @@ static int netem_dump(struct Qdisc *sch, struct sk_buff *skb) struct tc_netem_corr cor; struct tc_netem_reorder reorder; struct tc_netem_corrupt corrupt; + struct tc_netem_rate rate; qopt.latency = q->latency; qopt.jitter = q->jitter; @@ -868,6 +905,9 @@ static int netem_dump(struct Qdisc *sch, struct sk_buff *skb) corrupt.correlation = q->corrupt_cor.rho; NLA_PUT(skb, TCA_NETEM_CORRUPT, sizeof(corrupt), &corrupt); + rate.rate = q->rate; + NLA_PUT(skb, TCA_NETEM_RATE, sizeof(rate), &rate); + if (dump_loss_model(q, skb) != 0) goto nla_put_failure; -- cgit v1.2.3 From 8af2a218de38f51ea4b4fa48cac1273319ae260c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 8 Dec 2011 06:06:03 +0000 Subject: sch_red: Adaptative RED AQM Adaptative RED AQM for linux, based on paper from Sally FLoyd, Ramakrishna Gummadi, and Scott Shenker, August 2001 : http://icir.org/floyd/papers/adaptiveRed.pdf Goal of Adaptative RED is to make max_p a dynamic value between 1% and 50% to reach the target average queue : (max_th - min_th) / 2 Every 500 ms: if (avg > target and max_p <= 0.5) increase max_p : max_p += alpha; else if (avg < target and max_p >= 0.01) decrease max_p : max_p *= beta; target :[min_th + 0.4*(min_th - max_th), min_th + 0.6*(min_th - max_th)]. alpha : min(0.01, max_p / 4) beta : 0.9 max_P is a Q0.32 fixed point number (unsigned, with 32 bits mantissa) Changes against our RED implementation are : max_p is no longer a negative power of two (1/(2^Plog)), but a Q0.32 fixed point number, to allow full range described in Adatative paper. To deliver a random number, we now use a reciprocal divide (thats really a multiply), but this operation is done once per marked/droped packet when in RED_BETWEEN_TRESH window, so added cost (compared to previous AND operation) is near zero. dump operation gives current max_p value in a new TCA_RED_MAX_P attribute. Example on a 10Mbit link : tc qdisc add dev $DEV parent 1:1 handle 10: est 1sec 8sec red \ limit 400000 min 30000 max 90000 avpkt 1000 \ burst 55 ecn adaptative bandwidth 10Mbit # tc -s -d qdisc show dev eth3 ... qdisc red 10: parent 1:1 limit 400000b min 30000b max 90000b ecn adaptative ewma 5 max_p=0.113335 Scell_log 15 Sent 50414282 bytes 34504 pkt (dropped 35, overlimits 1392 requeues 0) rate 9749Kbit 831pps backlog 72056b 16p requeues 0 marked 1357 early 35 pdrop 0 other 0 Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/pkt_sched.h | 6 ++- include/net/red.h | 101 ++++++++++++++++++++++++++++++++++++++-------- lib/reciprocal_div.c | 2 + net/sched/sch_red.c | 21 ++++++++++ 4 files changed, 111 insertions(+), 19 deletions(-) (limited to 'include/linux/pkt_sched.h') diff --git a/include/linux/pkt_sched.h b/include/linux/pkt_sched.h index fb556dc594d3..e41e0d4de24b 100644 --- a/include/linux/pkt_sched.h +++ b/include/linux/pkt_sched.h @@ -181,6 +181,7 @@ enum { TCA_RED_UNSPEC, TCA_RED_PARMS, TCA_RED_STAB, + TCA_RED_MAX_P, __TCA_RED_MAX, }; @@ -194,8 +195,9 @@ struct tc_red_qopt { unsigned char Plog; /* log(P_max/(qth_max-qth_min)) */ unsigned char Scell_log; /* cell size for idle damping */ unsigned char flags; -#define TC_RED_ECN 1 -#define TC_RED_HARDDROP 2 +#define TC_RED_ECN 1 +#define TC_RED_HARDDROP 2 +#define TC_RED_ADAPTATIVE 4 }; struct tc_red_xstats { diff --git a/include/net/red.h b/include/net/red.h index b72a3b833936..24606b22d01e 100644 --- a/include/net/red.h +++ b/include/net/red.h @@ -5,6 +5,7 @@ #include #include #include +#include /* Random Early Detection (RED) algorithm. ======================================= @@ -87,6 +88,29 @@ etc. */ +/* + * Adaptative RED : An Algorithm for Increasing the Robustness of RED's AQM + * (Sally FLoyd, Ramakrishna Gummadi, and Scott Shenker) August 2001 + * + * Every 500 ms: + * if (avg > target and max_p <= 0.5) + * increase max_p : max_p += alpha; + * else if (avg < target and max_p >= 0.01) + * decrease max_p : max_p *= beta; + * + * target :[qth_min + 0.4*(qth_min - qth_max), + * qth_min + 0.6*(qth_min - qth_max)]. + * alpha : min(0.01, max_p / 4) + * beta : 0.9 + * max_P is a Q0.32 fixed point number (with 32 bits mantissa) + * max_P between 0.01 and 0.5 (1% - 50%) [ Its no longer a negative power of two ] + */ +#define RED_ONE_PERCENT ((u32)DIV_ROUND_CLOSEST(1ULL<<32, 100)) + +#define MAX_P_MIN (1 * RED_ONE_PERCENT) +#define MAX_P_MAX (50 * RED_ONE_PERCENT) +#define MAX_P_ALPHA(val) min(MAX_P_MIN, val / 4) + #define RED_STAB_SIZE 256 #define RED_STAB_MASK (RED_STAB_SIZE - 1) @@ -101,10 +125,14 @@ struct red_stats { struct red_parms { /* Parameters */ - u32 qth_min; /* Min avg length threshold: A scaled */ - u32 qth_max; /* Max avg length threshold: A scaled */ + u32 qth_min; /* Min avg length threshold: Wlog scaled */ + u32 qth_max; /* Max avg length threshold: Wlog scaled */ u32 Scell_max; - u32 Rmask; /* Cached random mask, see red_rmask */ + u32 max_P; /* probability, [0 .. 1.0] 32 scaled */ + u32 max_P_reciprocal; /* reciprocal_value(max_P / qth_delta) */ + u32 qth_delta; /* max_th - min_th */ + u32 target_min; /* min_th + 0.4*(max_th - min_th) */ + u32 target_max; /* min_th + 0.6*(max_th - min_th) */ u8 Scell_log; u8 Wlog; /* log(W) */ u8 Plog; /* random number bits */ @@ -115,19 +143,22 @@ struct red_parms { number generation */ u32 qR; /* Cached random number */ - unsigned long qavg; /* Average queue length: A scaled */ + unsigned long qavg; /* Average queue length: Wlog scaled */ ktime_t qidlestart; /* Start of current idle period */ }; -static inline u32 red_rmask(u8 Plog) +static inline u32 red_maxp(u8 Plog) { - return Plog < 32 ? ((1 << Plog) - 1) : ~0UL; + return Plog < 32 ? (~0U >> Plog) : ~0U; } + static inline void red_set_parms(struct red_parms *p, u32 qth_min, u32 qth_max, u8 Wlog, u8 Plog, u8 Scell_log, u8 *stab) { + int delta = qth_max - qth_min; + /* Reset average queue length, the value is strictly bound * to the parameters below, reseting hurts a bit but leaving * it might result in an unreasonable qavg for a while. --TGR @@ -139,14 +170,29 @@ static inline void red_set_parms(struct red_parms *p, p->qth_max = qth_max << Wlog; p->Wlog = Wlog; p->Plog = Plog; - p->Rmask = red_rmask(Plog); + if (delta < 0) + delta = 1; + p->qth_delta = delta; + p->max_P = red_maxp(Plog); + p->max_P *= delta; /* max_P = (qth_max-qth_min)/2^Plog */ + + p->max_P_reciprocal = reciprocal_value(p->max_P / delta); + + /* RED Adaptative target : + * [min_th + 0.4*(min_th - max_th), + * min_th + 0.6*(min_th - max_th)]. + */ + delta /= 5; + p->target_min = qth_min + 2*delta; + p->target_max = qth_min + 3*delta; + p->Scell_log = Scell_log; p->Scell_max = (255 << Scell_log); memcpy(p->Stab, stab, sizeof(p->Stab)); } -static inline int red_is_idling(struct red_parms *p) +static inline int red_is_idling(const struct red_parms *p) { return p->qidlestart.tv64 != 0; } @@ -168,7 +214,7 @@ static inline void red_restart(struct red_parms *p) p->qcount = -1; } -static inline unsigned long red_calc_qavg_from_idle_time(struct red_parms *p) +static inline unsigned long red_calc_qavg_from_idle_time(const struct red_parms *p) { s64 delta = ktime_us_delta(ktime_get(), p->qidlestart); long us_idle = min_t(s64, delta, p->Scell_max); @@ -215,7 +261,7 @@ static inline unsigned long red_calc_qavg_from_idle_time(struct red_parms *p) } } -static inline unsigned long red_calc_qavg_no_idle_time(struct red_parms *p, +static inline unsigned long red_calc_qavg_no_idle_time(const struct red_parms *p, unsigned int backlog) { /* @@ -230,7 +276,7 @@ static inline unsigned long red_calc_qavg_no_idle_time(struct red_parms *p, return p->qavg + (backlog - (p->qavg >> p->Wlog)); } -static inline unsigned long red_calc_qavg(struct red_parms *p, +static inline unsigned long red_calc_qavg(const struct red_parms *p, unsigned int backlog) { if (!red_is_idling(p)) @@ -239,23 +285,24 @@ static inline unsigned long red_calc_qavg(struct red_parms *p, return red_calc_qavg_from_idle_time(p); } -static inline u32 red_random(struct red_parms *p) + +static inline u32 red_random(const struct red_parms *p) { - return net_random() & p->Rmask; + return reciprocal_divide(net_random(), p->max_P_reciprocal); } -static inline int red_mark_probability(struct red_parms *p, unsigned long qavg) +static inline int red_mark_probability(const struct red_parms *p, unsigned long qavg) { /* The formula used below causes questions. - OK. qR is random number in the interval 0..Rmask + OK. qR is random number in the interval + (0..1/max_P)*(qth_max-qth_min) i.e. 0..(2^Plog). If we used floating point arithmetics, it would be: (2^Plog)*rnd_num, where rnd_num is less 1. Taking into account, that qavg have fixed - point at Wlog, and Plog is related to max_P by - max_P = (qth_max-qth_min)/2^Plog; two lines + point at Wlog, two lines below have the following floating point equivalent: max_P*(qavg - qth_min)/(qth_max-qth_min) < rnd/qcount @@ -315,4 +362,24 @@ static inline int red_action(struct red_parms *p, unsigned long qavg) return RED_DONT_MARK; } +static inline void red_adaptative_algo(struct red_parms *p) +{ + unsigned long qavg; + u32 max_p_delta; + + qavg = p->qavg; + if (red_is_idling(p)) + qavg = red_calc_qavg_from_idle_time(p); + + /* p->qavg is fixed point number with point at Wlog */ + qavg >>= p->Wlog; + + if (qavg > p->target_max && p->max_P <= MAX_P_MAX) + p->max_P += MAX_P_ALPHA(p->max_P); /* maxp = maxp + alpha */ + else if (qavg < p->target_min && p->max_P >= MAX_P_MIN) + p->max_P = (p->max_P/10)*9; /* maxp = maxp * Beta */ + + max_p_delta = DIV_ROUND_CLOSEST(p->max_P, p->qth_delta); + p->max_P_reciprocal = reciprocal_value(max_p_delta); +} #endif diff --git a/lib/reciprocal_div.c b/lib/reciprocal_div.c index 6a3bd48fa2a0..75510e94f7d0 100644 --- a/lib/reciprocal_div.c +++ b/lib/reciprocal_div.c @@ -1,5 +1,6 @@ #include #include +#include u32 reciprocal_value(u32 k) { @@ -7,3 +8,4 @@ u32 reciprocal_value(u32 k) do_div(val, k); return (u32)val; } +EXPORT_SYMBOL(reciprocal_value); diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c index d617161f8dd3..8f5a85bf9d10 100644 --- a/net/sched/sch_red.c +++ b/net/sched/sch_red.c @@ -39,6 +39,7 @@ struct red_sched_data { u32 limit; /* HARD maximal queue length */ unsigned char flags; + struct timer_list adapt_timer; struct red_parms parms; struct red_stats stats; struct Qdisc *qdisc; @@ -161,6 +162,8 @@ static void red_reset(struct Qdisc *sch) static void red_destroy(struct Qdisc *sch) { struct red_sched_data *q = qdisc_priv(sch); + + del_timer_sync(&q->adapt_timer); qdisc_destroy(q->qdisc); } @@ -209,6 +212,10 @@ static int red_change(struct Qdisc *sch, struct nlattr *opt) ctl->Plog, ctl->Scell_log, nla_data(tb[TCA_RED_STAB])); + del_timer(&q->adapt_timer); + if (ctl->flags & TC_RED_ADAPTATIVE) + mod_timer(&q->adapt_timer, jiffies + HZ/2); + if (!q->qdisc->q.qlen) red_start_of_idle_period(&q->parms); @@ -216,11 +223,24 @@ static int red_change(struct Qdisc *sch, struct nlattr *opt) return 0; } +static inline void red_adaptative_timer(unsigned long arg) +{ + struct Qdisc *sch = (struct Qdisc *)arg; + struct red_sched_data *q = qdisc_priv(sch); + spinlock_t *root_lock = qdisc_lock(qdisc_root_sleeping(sch)); + + spin_lock(root_lock); + red_adaptative_algo(&q->parms); + mod_timer(&q->adapt_timer, jiffies + HZ/2); + spin_unlock(root_lock); +} + static int red_init(struct Qdisc *sch, struct nlattr *opt) { struct red_sched_data *q = qdisc_priv(sch); q->qdisc = &noop_qdisc; + setup_timer(&q->adapt_timer, red_adaptative_timer, (unsigned long)sch); return red_change(sch, opt); } @@ -243,6 +263,7 @@ static int red_dump(struct Qdisc *sch, struct sk_buff *skb) if (opts == NULL) goto nla_put_failure; NLA_PUT(skb, TCA_RED_PARMS, sizeof(opt), &opt); + NLA_PUT_U32(skb, TCA_RED_MAX_P, q->parms.max_P); return nla_nest_end(skb, opts); nla_put_failure: -- cgit v1.2.3 From a73ed26bbae7327370c5bd298f07de78df9e3466 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 9 Dec 2011 02:46:45 +0000 Subject: sch_red: generalize accurate MAX_P support to RED/GRED/CHOKE Now RED uses a Q0.32 number to store max_p (max probability), allow RED/GRED/CHOKE to use/report full resolution at config/dump time. Old tc binaries are non aware of new attributes, and still set/get Plog. New tc binary set/get both Plog and max_p for backward compatibility, they display "probability value" if they get max_p from new kernels. # tc -d qdisc show dev ... ... qdisc red 10: parent 1:1 limit 360Kb min 30Kb max 90Kb ecn ewma 5 probability 0.09 Scell_log 15 Make sure we avoid potential divides by 0 in reciprocal_value(), if (max_th - min_th) is big. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/pkt_sched.h | 2 ++ include/net/red.h | 16 +++++++++++----- net/sched/sch_choke.c | 8 +++++++- net/sched/sch_gred.c | 22 ++++++++++++++++++---- net/sched/sch_red.c | 9 +++++++-- 5 files changed, 45 insertions(+), 12 deletions(-) (limited to 'include/linux/pkt_sched.h') diff --git a/include/linux/pkt_sched.h b/include/linux/pkt_sched.h index e41e0d4de24b..8786ea741f52 100644 --- a/include/linux/pkt_sched.h +++ b/include/linux/pkt_sched.h @@ -216,6 +216,7 @@ enum { TCA_GRED_PARMS, TCA_GRED_STAB, TCA_GRED_DPS, + TCA_GRED_MAX_P, __TCA_GRED_MAX, }; @@ -255,6 +256,7 @@ enum { TCA_CHOKE_UNSPEC, TCA_CHOKE_PARMS, TCA_CHOKE_STAB, + TCA_CHOKE_MAX_P, __TCA_CHOKE_MAX, }; diff --git a/include/net/red.h b/include/net/red.h index 24606b22d01e..ef715a16cce4 100644 --- a/include/net/red.h +++ b/include/net/red.h @@ -155,9 +155,10 @@ static inline u32 red_maxp(u8 Plog) static inline void red_set_parms(struct red_parms *p, u32 qth_min, u32 qth_max, u8 Wlog, u8 Plog, - u8 Scell_log, u8 *stab) + u8 Scell_log, u8 *stab, u32 max_P) { int delta = qth_max - qth_min; + u32 max_p_delta; /* Reset average queue length, the value is strictly bound * to the parameters below, reseting hurts a bit but leaving @@ -173,10 +174,14 @@ static inline void red_set_parms(struct red_parms *p, if (delta < 0) delta = 1; p->qth_delta = delta; - p->max_P = red_maxp(Plog); - p->max_P *= delta; /* max_P = (qth_max-qth_min)/2^Plog */ - - p->max_P_reciprocal = reciprocal_value(p->max_P / delta); + if (!max_P) { + max_P = red_maxp(Plog); + max_P *= delta; /* max_P = (qth_max - qth_min)/2^Plog */ + } + p->max_P = max_P; + max_p_delta = max_P / delta; + max_p_delta = max(max_p_delta, 1U); + p->max_P_reciprocal = reciprocal_value(max_p_delta); /* RED Adaptative target : * [min_th + 0.4*(min_th - max_th), @@ -380,6 +385,7 @@ static inline void red_adaptative_algo(struct red_parms *p) p->max_P = (p->max_P/10)*9; /* maxp = maxp * Beta */ max_p_delta = DIV_ROUND_CLOSEST(p->max_P, p->qth_delta); + max_p_delta = max(max_p_delta, 1U); p->max_P_reciprocal = reciprocal_value(max_p_delta); } #endif diff --git a/net/sched/sch_choke.c b/net/sched/sch_choke.c index 205d369a217c..bef00acb8bd2 100644 --- a/net/sched/sch_choke.c +++ b/net/sched/sch_choke.c @@ -394,6 +394,7 @@ static void choke_reset(struct Qdisc *sch) static const struct nla_policy choke_policy[TCA_CHOKE_MAX + 1] = { [TCA_CHOKE_PARMS] = { .len = sizeof(struct tc_red_qopt) }, [TCA_CHOKE_STAB] = { .len = RED_STAB_SIZE }, + [TCA_CHOKE_MAX_P] = { .type = NLA_U32 }, }; @@ -415,6 +416,7 @@ static int choke_change(struct Qdisc *sch, struct nlattr *opt) int err; struct sk_buff **old = NULL; unsigned int mask; + u32 max_P; if (opt == NULL) return -EINVAL; @@ -427,6 +429,8 @@ static int choke_change(struct Qdisc *sch, struct nlattr *opt) tb[TCA_CHOKE_STAB] == NULL) return -EINVAL; + max_P = tb[TCA_CHOKE_MAX_P] ? nla_get_u32(tb[TCA_CHOKE_MAX_P]) : 0; + ctl = nla_data(tb[TCA_CHOKE_PARMS]); if (ctl->limit > CHOKE_MAX_QUEUE) @@ -476,7 +480,8 @@ static int choke_change(struct Qdisc *sch, struct nlattr *opt) red_set_parms(&q->parms, ctl->qth_min, ctl->qth_max, ctl->Wlog, ctl->Plog, ctl->Scell_log, - nla_data(tb[TCA_CHOKE_STAB])); + nla_data(tb[TCA_CHOKE_STAB]), + max_P); if (q->head == q->tail) red_end_of_idle_period(&q->parms); @@ -510,6 +515,7 @@ static int choke_dump(struct Qdisc *sch, struct sk_buff *skb) goto nla_put_failure; NLA_PUT(skb, TCA_CHOKE_PARMS, sizeof(opt), &opt); + NLA_PUT_U32(skb, TCA_CHOKE_MAX_P, q->parms.max_P); return nla_nest_end(skb, opts); nla_put_failure: diff --git a/net/sched/sch_gred.c b/net/sched/sch_gred.c index b9493a09a870..a1b7407ac2a4 100644 --- a/net/sched/sch_gred.c +++ b/net/sched/sch_gred.c @@ -34,7 +34,7 @@ struct gred_sched; struct gred_sched_data { u32 limit; /* HARD maximal queue length */ - u32 DP; /* the drop pramaters */ + u32 DP; /* the drop parameters */ u32 bytesin; /* bytes seen on virtualQ so far*/ u32 packetsin; /* packets seen on virtualQ so far*/ u32 backlog; /* bytes on the virtualQ */ @@ -379,7 +379,8 @@ static inline int gred_change_table_def(struct Qdisc *sch, struct nlattr *dps) } static inline int gred_change_vq(struct Qdisc *sch, int dp, - struct tc_gred_qopt *ctl, int prio, u8 *stab) + struct tc_gred_qopt *ctl, int prio, + u8 *stab, u32 max_P) { struct gred_sched *table = qdisc_priv(sch); struct gred_sched_data *q; @@ -400,7 +401,7 @@ static inline int gred_change_vq(struct Qdisc *sch, int dp, red_set_parms(&q->parms, ctl->qth_min, ctl->qth_max, ctl->Wlog, ctl->Plog, - ctl->Scell_log, stab); + ctl->Scell_log, stab, max_P); return 0; } @@ -409,6 +410,7 @@ static const struct nla_policy gred_policy[TCA_GRED_MAX + 1] = { [TCA_GRED_PARMS] = { .len = sizeof(struct tc_gred_qopt) }, [TCA_GRED_STAB] = { .len = 256 }, [TCA_GRED_DPS] = { .len = sizeof(struct tc_gred_sopt) }, + [TCA_GRED_MAX_P] = { .type = NLA_U32 }, }; static int gred_change(struct Qdisc *sch, struct nlattr *opt) @@ -418,6 +420,7 @@ static int gred_change(struct Qdisc *sch, struct nlattr *opt) struct nlattr *tb[TCA_GRED_MAX + 1]; int err, prio = GRED_DEF_PRIO; u8 *stab; + u32 max_P; if (opt == NULL) return -EINVAL; @@ -433,6 +436,8 @@ static int gred_change(struct Qdisc *sch, struct nlattr *opt) tb[TCA_GRED_STAB] == NULL) return -EINVAL; + max_P = tb[TCA_GRED_MAX_P] ? nla_get_u32(tb[TCA_GRED_MAX_P]) : 0; + err = -EINVAL; ctl = nla_data(tb[TCA_GRED_PARMS]); stab = nla_data(tb[TCA_GRED_STAB]); @@ -457,7 +462,7 @@ static int gred_change(struct Qdisc *sch, struct nlattr *opt) sch_tree_lock(sch); - err = gred_change_vq(sch, ctl->DP, ctl, prio, stab); + err = gred_change_vq(sch, ctl->DP, ctl, prio, stab, max_P); if (err < 0) goto errout_locked; @@ -498,6 +503,7 @@ static int gred_dump(struct Qdisc *sch, struct sk_buff *skb) struct gred_sched *table = qdisc_priv(sch); struct nlattr *parms, *opts = NULL; int i; + u32 max_p[MAX_DPs]; struct tc_gred_sopt sopt = { .DPs = table->DPs, .def_DP = table->def, @@ -509,6 +515,14 @@ static int gred_dump(struct Qdisc *sch, struct sk_buff *skb) if (opts == NULL) goto nla_put_failure; NLA_PUT(skb, TCA_GRED_DPS, sizeof(sopt), &sopt); + + for (i = 0; i < MAX_DPs; i++) { + struct gred_sched_data *q = table->tab[i]; + + max_p[i] = q ? q->parms.max_P : 0; + } + NLA_PUT(skb, TCA_GRED_MAX_P, sizeof(max_p), max_p); + parms = nla_nest_start(skb, TCA_GRED_PARMS); if (parms == NULL) goto nla_put_failure; diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c index 8f5a85bf9d10..ce2256a17d7e 100644 --- a/net/sched/sch_red.c +++ b/net/sched/sch_red.c @@ -170,6 +170,7 @@ static void red_destroy(struct Qdisc *sch) static const struct nla_policy red_policy[TCA_RED_MAX + 1] = { [TCA_RED_PARMS] = { .len = sizeof(struct tc_red_qopt) }, [TCA_RED_STAB] = { .len = RED_STAB_SIZE }, + [TCA_RED_MAX_P] = { .type = NLA_U32 }, }; static int red_change(struct Qdisc *sch, struct nlattr *opt) @@ -179,6 +180,7 @@ static int red_change(struct Qdisc *sch, struct nlattr *opt) struct tc_red_qopt *ctl; struct Qdisc *child = NULL; int err; + u32 max_P; if (opt == NULL) return -EINVAL; @@ -191,6 +193,8 @@ static int red_change(struct Qdisc *sch, struct nlattr *opt) tb[TCA_RED_STAB] == NULL) return -EINVAL; + max_P = tb[TCA_RED_MAX_P] ? nla_get_u32(tb[TCA_RED_MAX_P]) : 0; + ctl = nla_data(tb[TCA_RED_PARMS]); if (ctl->limit > 0) { @@ -209,8 +213,9 @@ static int red_change(struct Qdisc *sch, struct nlattr *opt) } red_set_parms(&q->parms, ctl->qth_min, ctl->qth_max, ctl->Wlog, - ctl->Plog, ctl->Scell_log, - nla_data(tb[TCA_RED_STAB])); + ctl->Plog, ctl->Scell_log, + nla_data(tb[TCA_RED_STAB]), + max_P); del_timer(&q->adapt_timer); if (ctl->flags & TC_RED_ADAPTATIVE) -- cgit v1.2.3 From 90b41a1cd44cc4e507b554ae5a36562a1ba9a4e8 Mon Sep 17 00:00:00 2001 From: Hagen Paul Pfeifer Date: Mon, 12 Dec 2011 14:30:00 +0000 Subject: netem: add cell concept to simulate special MAC behavior This extension can be used to simulate special link layer characteristics. Simulate because packet data is not modified, only the calculation base is changed to delay a packet based on the original packet size and artificial cell information. packet_overhead can be used to simulate a link layer header compression scheme (e.g. set packet_overhead to -20) or with a positive packet_overhead value an additional MAC header can be simulated. It is also possible to "replace" the 14 byte Ethernet header with something else. cell_size and cell_overhead can be used to simulate link layer schemes, based on cells, like some TDMA schemes. Another application area are MAC schemes using a link layer fragmentation with a (small) header each. Cell size is the maximum amount of data bytes within one cell. Cell overhead is an additional variable to change the per-cell-overhead (e.g. 5 byte header per fragment). Example (5 kbit/s, 20 byte per packet overhead, cell-size 100 byte, per cell overhead 5 byte): tc qdisc add dev eth0 root netem rate 5kbit 20 100 5 Signed-off-by: Hagen Paul Pfeifer Signed-off-by: Florian Westphal Acked-by: Stephen Hemminger Signed-off-by: David S. Miller --- include/linux/pkt_sched.h | 3 +++ net/sched/sch_netem.c | 33 +++++++++++++++++++++++++++++---- 2 files changed, 32 insertions(+), 4 deletions(-) (limited to 'include/linux/pkt_sched.h') diff --git a/include/linux/pkt_sched.h b/include/linux/pkt_sched.h index 8786ea741f52..8daced32a014 100644 --- a/include/linux/pkt_sched.h +++ b/include/linux/pkt_sched.h @@ -502,6 +502,9 @@ struct tc_netem_corrupt { struct tc_netem_rate { __u32 rate; /* byte/s */ + __s32 packet_overhead; + __u32 cell_size; + __s32 cell_overhead; }; enum { diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index 3bfd73344f76..1fa2f903d221 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include @@ -80,6 +81,10 @@ struct netem_sched_data { u32 reorder; u32 corrupt; u32 rate; + s32 packet_overhead; + u32 cell_size; + u32 cell_size_reciprocal; + s32 cell_overhead; struct crndstate { u32 last; @@ -299,11 +304,23 @@ static psched_tdiff_t tabledist(psched_tdiff_t mu, psched_tdiff_t sigma, return x / NETEM_DIST_SCALE + (sigma / NETEM_DIST_SCALE) * t + mu; } -static psched_time_t packet_len_2_sched_time(unsigned int len, u32 rate) +static psched_time_t packet_len_2_sched_time(unsigned int len, struct netem_sched_data *q) { - u64 ticks = (u64)len * NSEC_PER_SEC; + u64 ticks; - do_div(ticks, rate); + len += q->packet_overhead; + + if (q->cell_size) { + u32 cells = reciprocal_divide(len, q->cell_size_reciprocal); + + if (len > cells * q->cell_size) /* extra cell needed for remainder */ + cells++; + len = cells * (q->cell_size + q->cell_overhead); + } + + ticks = (u64)len * NSEC_PER_SEC; + + do_div(ticks, q->rate); return PSCHED_NS2TICKS(ticks); } @@ -384,7 +401,7 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch) if (q->rate) { struct sk_buff_head *list = &q->qdisc->q; - delay += packet_len_2_sched_time(skb->len, q->rate); + delay += packet_len_2_sched_time(skb->len, q); if (!skb_queue_empty(list)) { /* @@ -568,6 +585,11 @@ static void get_rate(struct Qdisc *sch, const struct nlattr *attr) const struct tc_netem_rate *r = nla_data(attr); q->rate = r->rate; + q->packet_overhead = r->packet_overhead; + q->cell_size = r->cell_size; + if (q->cell_size) + q->cell_size_reciprocal = reciprocal_value(q->cell_size); + q->cell_overhead = r->cell_overhead; } static int get_loss_clg(struct Qdisc *sch, const struct nlattr *attr) @@ -909,6 +931,9 @@ static int netem_dump(struct Qdisc *sch, struct sk_buff *skb) NLA_PUT(skb, TCA_NETEM_CORRUPT, sizeof(corrupt), &corrupt); rate.rate = q->rate; + rate.packet_overhead = q->packet_overhead; + rate.cell_size = q->cell_size; + rate.cell_overhead = q->cell_overhead; NLA_PUT(skb, TCA_NETEM_RATE, sizeof(rate), &rate); if (dump_loss_model(q, skb) != 0) -- cgit v1.2.3 From 18cb809850fb499ad9bf288696a95f4071f73931 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 4 Jan 2012 14:18:38 +0000 Subject: net_sched: sfq: extend limits SFQ as implemented in Linux is very limited, with at most 127 flows and limit of 127 packets. [ So if 127 flows are active, we have one packet per flow ] This patch brings to SFQ following features to cope with modern needs. - Ability to specify a smaller per flow limit of inflight packets. (default value being at 127 packets) - Ability to have up to 65408 active flows (instead of 127) - Ability to have head drops instead of tail drops (to drop old packets from a flow) Example of use : No more than 20 packets per flow, max 8000 flows, max 20000 packets in SFQ qdisc, hash table of 65536 slots. tc qdisc add ... sfq \ flows 8000 \ depth 20 \ headdrop \ limit 20000 \ divisor 65536 Ram usage : 2 bytes per hash table entry (instead of previous 1 byte/entry) 32 bytes per flow on 64bit arches, instead of 384 for QFQ, so much better cache hit ratio. Signed-off-by: Eric Dumazet CC: Dave Taht Signed-off-by: David S. Miller --- include/linux/pkt_sched.h | 16 ++--- net/sched/sch_sfq.c | 175 +++++++++++++++++++++++++++++++--------------- 2 files changed, 124 insertions(+), 67 deletions(-) (limited to 'include/linux/pkt_sched.h') diff --git a/include/linux/pkt_sched.h b/include/linux/pkt_sched.h index 8daced32a014..8f1b928f777c 100644 --- a/include/linux/pkt_sched.h +++ b/include/linux/pkt_sched.h @@ -162,19 +162,17 @@ struct tc_sfq_qopt { unsigned flows; /* Maximal number of flows */ }; +struct tc_sfq_qopt_v1 { + struct tc_sfq_qopt v0; + unsigned int depth; /* max number of packets per flow */ + unsigned int headdrop; +}; + + struct tc_sfq_xstats { __s32 allot; }; -/* - * NOTE: limit, divisor and flows are hardwired to code at the moment. - * - * limit=flows=128, divisor=1024; - * - * The only reason for this is efficiency, it is possible - * to change these parameters in compile time. - */ - /* RED section */ enum { diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c index 843018154a5c..0a7964009e8c 100644 --- a/net/sched/sch_sfq.c +++ b/net/sched/sch_sfq.c @@ -66,16 +66,18 @@ SFQ is superior for this purpose. IMPLEMENTATION: - This implementation limits maximal queue length to 128; - max mtu to 2^18-1; max 128 flows, number of hash buckets to 1024. - The only goal of this restrictions was that all data - fit into one 4K page on 32bit arches. + This implementation limits : + - maximal queue length per flow to 127 packets. + - max mtu to 2^18-1; + - max 65408 flows, + - number of hash buckets to 65536. It is easy to increase these values, but not in flight. */ -#define SFQ_DEPTH 128 /* max number of packets per flow */ -#define SFQ_SLOTS 128 /* max number of flows */ -#define SFQ_EMPTY_SLOT 255 +#define SFQ_MAX_DEPTH 127 /* max number of packets per flow */ +#define SFQ_DEFAULT_FLOWS 128 +#define SFQ_MAX_FLOWS (0x10000 - SFQ_MAX_DEPTH - 1) /* max number of flows */ +#define SFQ_EMPTY_SLOT 0xffff #define SFQ_DEFAULT_HASH_DIVISOR 1024 /* We use 16 bits to store allot, and want to handle packets up to 64K @@ -84,13 +86,13 @@ #define SFQ_ALLOT_SHIFT 3 #define SFQ_ALLOT_SIZE(X) DIV_ROUND_UP(X, 1 << SFQ_ALLOT_SHIFT) -/* This type should contain at least SFQ_DEPTH + SFQ_SLOTS values */ -typedef unsigned char sfq_index; +/* This type should contain at least SFQ_MAX_DEPTH + 1 + SFQ_MAX_FLOWS values */ +typedef u16 sfq_index; /* * We dont use pointers to save space. - * Small indexes [0 ... SFQ_SLOTS - 1] are 'pointers' to slots[] array - * while following values [SFQ_SLOTS ... SFQ_SLOTS + SFQ_DEPTH - 1] + * Small indexes [0 ... SFQ_MAX_FLOWS - 1] are 'pointers' to slots[] array + * while following values [SFQ_MAX_FLOWS ... SFQ_MAX_FLOWS + SFQ_MAX_DEPTH] * are 'pointers' to dep[] array */ struct sfq_head { @@ -102,28 +104,38 @@ struct sfq_slot { struct sk_buff *skblist_next; struct sk_buff *skblist_prev; sfq_index qlen; /* number of skbs in skblist */ - sfq_index next; /* next slot in sfq chain */ + sfq_index next; /* next slot in sfq RR chain */ struct sfq_head dep; /* anchor in dep[] chains */ unsigned short hash; /* hash value (index in ht[]) */ short allot; /* credit for this slot */ }; struct sfq_sched_data { -/* Parameters */ - int perturb_period; - unsigned int quantum; /* Allotment per round: MUST BE >= MTU */ - int limit; +/* frequently used fields */ + int limit; /* limit of total number of packets in this qdisc */ unsigned int divisor; /* number of slots in hash table */ -/* Variables */ - struct tcf_proto *filter_list; - struct timer_list perturb_timer; + unsigned int maxflows; /* number of flows in flows array */ + int headdrop; + int maxdepth; /* limit of packets per flow */ + u32 perturbation; + struct tcf_proto *filter_list; sfq_index cur_depth; /* depth of longest slot */ unsigned short scaled_quantum; /* SFQ_ALLOT_SIZE(quantum) */ struct sfq_slot *tail; /* current slot in round */ - sfq_index *ht; /* Hash table (divisor slots) */ - struct sfq_slot slots[SFQ_SLOTS]; - struct sfq_head dep[SFQ_DEPTH]; /* Linked list of slots, indexed by depth */ + sfq_index *ht; /* Hash table ('divisor' slots) */ + struct sfq_slot *slots; /* Flows table ('maxflows' entries) */ + + struct sfq_head dep[SFQ_MAX_DEPTH + 1]; + /* Linked lists of slots, indexed by depth + * dep[0] : list of unused flows + * dep[1] : list of flows with 1 packet + * dep[X] : list of flows with X packets + */ + + int perturb_period; + unsigned int quantum; /* Allotment per round: MUST BE >= MTU */ + struct timer_list perturb_timer; }; /* @@ -131,9 +143,9 @@ struct sfq_sched_data { */ static inline struct sfq_head *sfq_dep_head(struct sfq_sched_data *q, sfq_index val) { - if (val < SFQ_SLOTS) + if (val < SFQ_MAX_FLOWS) return &q->slots[val].dep; - return &q->dep[val - SFQ_SLOTS]; + return &q->dep[val - SFQ_MAX_FLOWS]; } /* @@ -199,18 +211,19 @@ static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch, } /* - * x : slot number [0 .. SFQ_SLOTS - 1] + * x : slot number [0 .. SFQ_MAX_FLOWS - 1] */ static inline void sfq_link(struct sfq_sched_data *q, sfq_index x) { sfq_index p, n; - int qlen = q->slots[x].qlen; + struct sfq_slot *slot = &q->slots[x]; + int qlen = slot->qlen; - p = qlen + SFQ_SLOTS; + p = qlen + SFQ_MAX_FLOWS; n = q->dep[qlen].next; - q->slots[x].dep.next = n; - q->slots[x].dep.prev = p; + slot->dep.next = n; + slot->dep.prev = p; q->dep[qlen].next = x; /* sfq_dep_head(q, p)->next = x */ sfq_dep_head(q, n)->prev = x; @@ -275,6 +288,7 @@ static inline struct sk_buff *slot_dequeue_head(struct sfq_slot *slot) static inline void slot_queue_init(struct sfq_slot *slot) { + memset(slot, 0, sizeof(*slot)); slot->skblist_prev = slot->skblist_next = (struct sk_buff *)slot; } @@ -305,7 +319,7 @@ static unsigned int sfq_drop(struct Qdisc *sch) x = q->dep[d].next; slot = &q->slots[x]; drop: - skb = slot_dequeue_tail(slot); + skb = q->headdrop ? slot_dequeue_head(slot) : slot_dequeue_tail(slot); len = qdisc_pkt_len(skb); sfq_dec(q, x); kfree_skb(skb); @@ -349,16 +363,27 @@ sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch) slot = &q->slots[x]; if (x == SFQ_EMPTY_SLOT) { x = q->dep[0].next; /* get a free slot */ + if (x >= SFQ_MAX_FLOWS) + return qdisc_drop(skb, sch); q->ht[hash] = x; slot = &q->slots[x]; slot->hash = hash; } - /* If selected queue has length q->limit, do simple tail drop, - * i.e. drop _this_ packet. - */ - if (slot->qlen >= q->limit) - return qdisc_drop(skb, sch); + if (slot->qlen >= q->maxdepth) { + struct sk_buff *head; + + if (!q->headdrop) + return qdisc_drop(skb, sch); + + head = slot_dequeue_head(slot); + sch->qstats.backlog -= qdisc_pkt_len(head); + qdisc_drop(head, sch); + + sch->qstats.backlog += qdisc_pkt_len(skb); + slot_queue_add(slot, skb); + return NET_XMIT_CN; + } sch->qstats.backlog += qdisc_pkt_len(skb); slot_queue_add(slot, skb); @@ -445,16 +470,18 @@ sfq_reset(struct Qdisc *sch) * We dont use sfq_dequeue()/sfq_enqueue() because we dont want to change * counters. */ -static void sfq_rehash(struct sfq_sched_data *q) +static void sfq_rehash(struct Qdisc *sch) { + struct sfq_sched_data *q = qdisc_priv(sch); struct sk_buff *skb; int i; struct sfq_slot *slot; struct sk_buff_head list; + int dropped = 0; __skb_queue_head_init(&list); - for (i = 0; i < SFQ_SLOTS; i++) { + for (i = 0; i < q->maxflows; i++) { slot = &q->slots[i]; if (!slot->qlen) continue; @@ -474,10 +501,18 @@ static void sfq_rehash(struct sfq_sched_data *q) slot = &q->slots[x]; if (x == SFQ_EMPTY_SLOT) { x = q->dep[0].next; /* get a free slot */ + if (x >= SFQ_MAX_FLOWS) { +drop: sch->qstats.backlog -= qdisc_pkt_len(skb); + kfree_skb(skb); + dropped++; + continue; + } q->ht[hash] = x; slot = &q->slots[x]; slot->hash = hash; } + if (slot->qlen >= q->maxdepth) + goto drop; slot_queue_add(slot, skb); sfq_inc(q, x); if (slot->qlen == 1) { /* The flow is new */ @@ -491,6 +526,8 @@ static void sfq_rehash(struct sfq_sched_data *q) slot->allot = q->scaled_quantum; } } + sch->q.qlen -= dropped; + qdisc_tree_decrease_qlen(sch, dropped); } static void sfq_perturbation(unsigned long arg) @@ -502,7 +539,7 @@ static void sfq_perturbation(unsigned long arg) spin_lock(root_lock); q->perturbation = net_random(); if (!q->filter_list && q->tail) - sfq_rehash(q); + sfq_rehash(sch); spin_unlock(root_lock); if (q->perturb_period) @@ -513,23 +550,39 @@ static int sfq_change(struct Qdisc *sch, struct nlattr *opt) { struct sfq_sched_data *q = qdisc_priv(sch); struct tc_sfq_qopt *ctl = nla_data(opt); + struct tc_sfq_qopt_v1 *ctl_v1 = NULL; unsigned int qlen; if (opt->nla_len < nla_attr_size(sizeof(*ctl))) return -EINVAL; - + if (opt->nla_len >= nla_attr_size(sizeof(*ctl_v1))) + ctl_v1 = nla_data(opt); if (ctl->divisor && (!is_power_of_2(ctl->divisor) || ctl->divisor > 65536)) return -EINVAL; sch_tree_lock(sch); - q->quantum = ctl->quantum ? : psched_mtu(qdisc_dev(sch)); - q->scaled_quantum = SFQ_ALLOT_SIZE(q->quantum); + if (ctl->quantum) { + q->quantum = ctl->quantum; + q->scaled_quantum = SFQ_ALLOT_SIZE(q->quantum); + } q->perturb_period = ctl->perturb_period * HZ; - if (ctl->limit) - q->limit = min_t(u32, ctl->limit, SFQ_DEPTH - 1); - if (ctl->divisor) + if (ctl->flows) + q->maxflows = min_t(u32, ctl->flows, SFQ_MAX_FLOWS); + if (ctl->divisor) { q->divisor = ctl->divisor; + q->maxflows = min_t(u32, q->maxflows, q->divisor); + } + if (ctl_v1) { + if (ctl_v1->depth) + q->maxdepth = min_t(u32, ctl_v1->depth, SFQ_MAX_DEPTH); + q->headdrop = ctl_v1->headdrop; + } + if (ctl->limit) { + q->limit = min_t(u32, ctl->limit, q->maxdepth * q->maxflows); + q->maxflows = min_t(u32, q->maxflows, q->limit); + } + qlen = sch->q.qlen; while (sch->q.qlen > q->limit) sfq_drop(sch); @@ -571,6 +624,7 @@ static void sfq_destroy(struct Qdisc *sch) q->perturb_period = 0; del_timer_sync(&q->perturb_timer); sfq_free(q->ht); + sfq_free(q->slots); } static int sfq_init(struct Qdisc *sch, struct nlattr *opt) @@ -582,15 +636,17 @@ static int sfq_init(struct Qdisc *sch, struct nlattr *opt) q->perturb_timer.data = (unsigned long)sch; init_timer_deferrable(&q->perturb_timer); - for (i = 0; i < SFQ_DEPTH; i++) { - q->dep[i].next = i + SFQ_SLOTS; - q->dep[i].prev = i + SFQ_SLOTS; + for (i = 0; i < SFQ_MAX_DEPTH + 1; i++) { + q->dep[i].next = i + SFQ_MAX_FLOWS; + q->dep[i].prev = i + SFQ_MAX_FLOWS; } - q->limit = SFQ_DEPTH - 1; + q->limit = SFQ_MAX_DEPTH; + q->maxdepth = SFQ_MAX_DEPTH; q->cur_depth = 0; q->tail = NULL; q->divisor = SFQ_DEFAULT_HASH_DIVISOR; + q->maxflows = SFQ_DEFAULT_FLOWS; q->quantum = psched_mtu(qdisc_dev(sch)); q->scaled_quantum = SFQ_ALLOT_SIZE(q->quantum); q->perturb_period = 0; @@ -603,14 +659,15 @@ static int sfq_init(struct Qdisc *sch, struct nlattr *opt) } q->ht = sfq_alloc(sizeof(q->ht[0]) * q->divisor); - if (!q->ht) { + q->slots = sfq_alloc(sizeof(q->slots[0]) * q->maxflows); + if (!q->ht || !q->slots) { sfq_destroy(sch); return -ENOMEM; } for (i = 0; i < q->divisor; i++) q->ht[i] = SFQ_EMPTY_SLOT; - for (i = 0; i < SFQ_SLOTS; i++) { + for (i = 0; i < q->maxflows; i++) { slot_queue_init(&q->slots[i]); sfq_link(q, i); } @@ -625,14 +682,16 @@ static int sfq_dump(struct Qdisc *sch, struct sk_buff *skb) { struct sfq_sched_data *q = qdisc_priv(sch); unsigned char *b = skb_tail_pointer(skb); - struct tc_sfq_qopt opt; - - opt.quantum = q->quantum; - opt.perturb_period = q->perturb_period / HZ; - - opt.limit = q->limit; - opt.divisor = q->divisor; - opt.flows = q->limit; + struct tc_sfq_qopt_v1 opt; + + memset(&opt, 0, sizeof(opt)); + opt.v0.quantum = q->quantum; + opt.v0.perturb_period = q->perturb_period / HZ; + opt.v0.limit = q->limit; + opt.v0.divisor = q->divisor; + opt.v0.flows = q->maxflows; + opt.depth = q->maxdepth; + opt.headdrop = q->headdrop; NLA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt); -- cgit v1.2.3 From ddecf0f4db44ef94847a62d6ecf74456b4dcc66f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 6 Jan 2012 06:31:44 +0000 Subject: net_sched: sfq: add optional RED on top of SFQ Adds an optional Random Early Detection on each SFQ flow queue. Traditional SFQ limits count of packets, while RED permits to also control number of bytes per flow, and adds ECN capability as well. 1) We dont handle the idle time management in this RED implementation, since each 'new flow' begins with a null qavg. We really want to address backlogged flows. 2) if headdrop is selected, we try to ecn mark first packet instead of currently enqueued packet. This gives faster feedback for tcp flows compared to traditional RED [ marking the last packet in queue ] Example of use : tc qdisc add dev $DEV parent 1:1 handle 10: est 1sec 4sec sfq \ limit 3000 headdrop flows 512 divisor 16384 \ redflowlimit 100000 min 8000 max 60000 probability 0.20 ecn qdisc sfq 10: parent 1:1 limit 3000p quantum 1514b depth 127 headdrop flows 512/16384 divisor 16384 ewma 6 min 8000b max 60000b probability 0.2 ecn prob_mark 0 prob_mark_head 4876 prob_drop 6131 forced_mark 0 forced_mark_head 0 forced_drop 0 Sent 1175211782 bytes 777537 pkt (dropped 6131, overlimits 11007 requeues 0) rate 99483Kbit 8219pps backlog 689392b 456p requeues 0 In this test, with 64 netperf TCP_STREAM sessions, 50% using ECN enabled flows, we can see number of packets CE marked is smaller than number of drops (for non ECN flows) If same test is run, without RED, we can check backlog is much bigger. qdisc sfq 10: parent 1:1 limit 3000p quantum 1514b depth 127 headdrop flows 512/16384 divisor 16384 Sent 1148683617 bytes 795006 pkt (dropped 0, overlimits 0 requeues 0) rate 98429Kbit 8521pps backlog 1221290b 841p requeues 0 Signed-off-by: Eric Dumazet CC: Stephen Hemminger CC: Dave Taht Tested-by: Dave Taht Signed-off-by: David S. Miller --- include/linux/pkt_sched.h | 20 +++++++ include/net/red.h | 3 +- net/sched/sch_sfq.c | 146 +++++++++++++++++++++++++++++++++++++++++----- 3 files changed, 152 insertions(+), 17 deletions(-) (limited to 'include/linux/pkt_sched.h') diff --git a/include/linux/pkt_sched.h b/include/linux/pkt_sched.h index 8f1b928f777c..0d5b79365d03 100644 --- a/include/linux/pkt_sched.h +++ b/include/linux/pkt_sched.h @@ -162,10 +162,30 @@ struct tc_sfq_qopt { unsigned flows; /* Maximal number of flows */ }; +struct tc_sfqred_stats { + __u32 prob_drop; /* Early drops, below max threshold */ + __u32 forced_drop; /* Early drops, after max threshold */ + __u32 prob_mark; /* Marked packets, below max threshold */ + __u32 forced_mark; /* Marked packets, after max threshold */ + __u32 prob_mark_head; /* Marked packets, below max threshold */ + __u32 forced_mark_head;/* Marked packets, after max threshold */ +}; + struct tc_sfq_qopt_v1 { struct tc_sfq_qopt v0; unsigned int depth; /* max number of packets per flow */ unsigned int headdrop; +/* SFQRED parameters */ + __u32 limit; /* HARD maximal flow queue length (bytes) */ + __u32 qth_min; /* Min average length threshold (bytes) */ + __u32 qth_max; /* Max average length threshold (bytes) */ + unsigned char Wlog; /* log(W) */ + unsigned char Plog; /* log(P_max/(qth_max-qth_min)) */ + unsigned char Scell_log; /* cell size for idle damping */ + unsigned char flags; + __u32 max_P; /* probability, high resolution */ +/* SFQRED stats */ + struct tc_sfqred_stats stats; }; diff --git a/include/net/red.h b/include/net/red.h index baab385a4736..28068ec614b2 100644 --- a/include/net/red.h +++ b/include/net/red.h @@ -199,7 +199,8 @@ static inline void red_set_parms(struct red_parms *p, p->Scell_log = Scell_log; p->Scell_max = (255 << Scell_log); - memcpy(p->Stab, stab, sizeof(p->Stab)); + if (stab) + memcpy(p->Stab, stab, sizeof(p->Stab)); } static inline int red_is_idling(const struct red_vars *v) diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c index 0a7964009e8c..67494aef9acf 100644 --- a/net/sched/sch_sfq.c +++ b/net/sched/sch_sfq.c @@ -24,6 +24,7 @@ #include #include #include +#include /* Stochastic Fairness Queuing algorithm. @@ -108,24 +109,30 @@ struct sfq_slot { struct sfq_head dep; /* anchor in dep[] chains */ unsigned short hash; /* hash value (index in ht[]) */ short allot; /* credit for this slot */ + + unsigned int backlog; + struct red_vars vars; }; struct sfq_sched_data { /* frequently used fields */ int limit; /* limit of total number of packets in this qdisc */ unsigned int divisor; /* number of slots in hash table */ - unsigned int maxflows; /* number of flows in flows array */ - int headdrop; - int maxdepth; /* limit of packets per flow */ + u8 headdrop; + u8 maxdepth; /* limit of packets per flow */ u32 perturbation; - struct tcf_proto *filter_list; - sfq_index cur_depth; /* depth of longest slot */ + u8 cur_depth; /* depth of longest slot */ + u8 flags; unsigned short scaled_quantum; /* SFQ_ALLOT_SIZE(quantum) */ - struct sfq_slot *tail; /* current slot in round */ + struct tcf_proto *filter_list; sfq_index *ht; /* Hash table ('divisor' slots) */ struct sfq_slot *slots; /* Flows table ('maxflows' entries) */ + struct red_parms *red_parms; + struct tc_sfqred_stats stats; + struct sfq_slot *tail; /* current slot in round */ + struct sfq_head dep[SFQ_MAX_DEPTH + 1]; /* Linked lists of slots, indexed by depth * dep[0] : list of unused flows @@ -133,6 +140,7 @@ struct sfq_sched_data { * dep[X] : list of flows with X packets */ + unsigned int maxflows; /* number of flows in flows array */ int perturb_period; unsigned int quantum; /* Allotment per round: MUST BE >= MTU */ struct timer_list perturb_timer; @@ -321,6 +329,7 @@ static unsigned int sfq_drop(struct Qdisc *sch) drop: skb = q->headdrop ? slot_dequeue_head(slot) : slot_dequeue_tail(slot); len = qdisc_pkt_len(skb); + slot->backlog -= len; sfq_dec(q, x); kfree_skb(skb); sch->q.qlen--; @@ -341,6 +350,23 @@ drop: return 0; } +/* Is ECN parameter configured */ +static int sfq_prob_mark(const struct sfq_sched_data *q) +{ + return q->flags & TC_RED_ECN; +} + +/* Should packets over max threshold just be marked */ +static int sfq_hard_mark(const struct sfq_sched_data *q) +{ + return (q->flags & (TC_RED_ECN | TC_RED_HARDDROP)) == TC_RED_ECN; +} + +static int sfq_headdrop(const struct sfq_sched_data *q) +{ + return q->headdrop; +} + static int sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch) { @@ -349,6 +375,8 @@ sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch) sfq_index x, qlen; struct sfq_slot *slot; int uninitialized_var(ret); + struct sk_buff *head; + int delta; hash = sfq_classify(skb, sch, &ret); if (hash == 0) { @@ -368,24 +396,75 @@ sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch) q->ht[hash] = x; slot = &q->slots[x]; slot->hash = hash; + slot->backlog = 0; /* should already be 0 anyway... */ + red_set_vars(&slot->vars); + goto enqueue; } + if (q->red_parms) { + slot->vars.qavg = red_calc_qavg_no_idle_time(q->red_parms, + &slot->vars, + slot->backlog); + switch (red_action(q->red_parms, + &slot->vars, + slot->vars.qavg)) { + case RED_DONT_MARK: + break; - if (slot->qlen >= q->maxdepth) { - struct sk_buff *head; + case RED_PROB_MARK: + sch->qstats.overlimits++; + if (sfq_prob_mark(q)) { + /* We know we have at least one packet in queue */ + if (sfq_headdrop(q) && + INET_ECN_set_ce(slot->skblist_next)) { + q->stats.prob_mark_head++; + break; + } + if (INET_ECN_set_ce(skb)) { + q->stats.prob_mark++; + break; + } + } + q->stats.prob_drop++; + goto congestion_drop; + + case RED_HARD_MARK: + sch->qstats.overlimits++; + if (sfq_hard_mark(q)) { + /* We know we have at least one packet in queue */ + if (sfq_headdrop(q) && + INET_ECN_set_ce(slot->skblist_next)) { + q->stats.forced_mark_head++; + break; + } + if (INET_ECN_set_ce(skb)) { + q->stats.forced_mark++; + break; + } + } + q->stats.forced_drop++; + goto congestion_drop; + } + } - if (!q->headdrop) + if (slot->qlen >= q->maxdepth) { +congestion_drop: + if (!sfq_headdrop(q)) return qdisc_drop(skb, sch); + /* We know we have at least one packet in queue */ head = slot_dequeue_head(slot); - sch->qstats.backlog -= qdisc_pkt_len(head); + delta = qdisc_pkt_len(head) - qdisc_pkt_len(skb); + sch->qstats.backlog -= delta; + slot->backlog -= delta; qdisc_drop(head, sch); - sch->qstats.backlog += qdisc_pkt_len(skb); slot_queue_add(slot, skb); return NET_XMIT_CN; } +enqueue: sch->qstats.backlog += qdisc_pkt_len(skb); + slot->backlog += qdisc_pkt_len(skb); slot_queue_add(slot, skb); sfq_inc(q, x); if (slot->qlen == 1) { /* The flow is new */ @@ -396,6 +475,7 @@ sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch) slot->next = q->tail->next; q->tail->next = x; } + /* We could use a bigger initial quantum for new flows */ slot->allot = q->scaled_quantum; } if (++sch->q.qlen <= q->limit) @@ -439,7 +519,7 @@ next_slot: qdisc_bstats_update(sch, skb); sch->q.qlen--; sch->qstats.backlog -= qdisc_pkt_len(skb); - + slot->backlog -= qdisc_pkt_len(skb); /* Is the slot empty? */ if (slot->qlen == 0) { q->ht[slot->hash] = SFQ_EMPTY_SLOT; @@ -490,6 +570,8 @@ static void sfq_rehash(struct Qdisc *sch) sfq_dec(q, i); __skb_queue_tail(&list, skb); } + slot->backlog = 0; + red_set_vars(&slot->vars); q->ht[slot->hash] = SFQ_EMPTY_SLOT; } q->tail = NULL; @@ -514,6 +596,11 @@ drop: sch->qstats.backlog -= qdisc_pkt_len(skb); if (slot->qlen >= q->maxdepth) goto drop; slot_queue_add(slot, skb); + if (q->red_parms) + slot->vars.qavg = red_calc_qavg(q->red_parms, + &slot->vars, + slot->backlog); + slot->backlog += qdisc_pkt_len(skb); sfq_inc(q, x); if (slot->qlen == 1) { /* The flow is new */ if (q->tail == NULL) { /* It is the first flow */ @@ -552,6 +639,7 @@ static int sfq_change(struct Qdisc *sch, struct nlattr *opt) struct tc_sfq_qopt *ctl = nla_data(opt); struct tc_sfq_qopt_v1 *ctl_v1 = NULL; unsigned int qlen; + struct red_parms *p = NULL; if (opt->nla_len < nla_attr_size(sizeof(*ctl))) return -EINVAL; @@ -560,7 +648,11 @@ static int sfq_change(struct Qdisc *sch, struct nlattr *opt) if (ctl->divisor && (!is_power_of_2(ctl->divisor) || ctl->divisor > 65536)) return -EINVAL; - + if (ctl_v1 && ctl_v1->qth_min) { + p = kmalloc(sizeof(*p), GFP_KERNEL); + if (!p) + return -ENOMEM; + } sch_tree_lock(sch); if (ctl->quantum) { q->quantum = ctl->quantum; @@ -576,6 +668,16 @@ static int sfq_change(struct Qdisc *sch, struct nlattr *opt) if (ctl_v1) { if (ctl_v1->depth) q->maxdepth = min_t(u32, ctl_v1->depth, SFQ_MAX_DEPTH); + if (p) { + swap(q->red_parms, p); + red_set_parms(q->red_parms, + ctl_v1->qth_min, ctl_v1->qth_max, + ctl_v1->Wlog, + ctl_v1->Plog, ctl_v1->Scell_log, + NULL, + ctl_v1->max_P); + } + q->flags = ctl_v1->flags; q->headdrop = ctl_v1->headdrop; } if (ctl->limit) { @@ -594,6 +696,7 @@ static int sfq_change(struct Qdisc *sch, struct nlattr *opt) q->perturbation = net_random(); } sch_tree_unlock(sch); + kfree(p); return 0; } @@ -625,6 +728,7 @@ static void sfq_destroy(struct Qdisc *sch) del_timer_sync(&q->perturb_timer); sfq_free(q->ht); sfq_free(q->slots); + kfree(q->red_parms); } static int sfq_init(struct Qdisc *sch, struct nlattr *opt) @@ -683,6 +787,7 @@ static int sfq_dump(struct Qdisc *sch, struct sk_buff *skb) struct sfq_sched_data *q = qdisc_priv(sch); unsigned char *b = skb_tail_pointer(skb); struct tc_sfq_qopt_v1 opt; + struct red_parms *p = q->red_parms; memset(&opt, 0, sizeof(opt)); opt.v0.quantum = q->quantum; @@ -693,6 +798,17 @@ static int sfq_dump(struct Qdisc *sch, struct sk_buff *skb) opt.depth = q->maxdepth; opt.headdrop = q->headdrop; + if (p) { + opt.qth_min = p->qth_min >> p->Wlog; + opt.qth_max = p->qth_max >> p->Wlog; + opt.Wlog = p->Wlog; + opt.Plog = p->Plog; + opt.Scell_log = p->Scell_log; + opt.max_P = p->max_P; + } + memcpy(&opt.stats, &q->stats, sizeof(opt.stats)); + opt.flags = q->flags; + NLA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt); return skb->len; @@ -747,15 +863,13 @@ static int sfq_dump_class_stats(struct Qdisc *sch, unsigned long cl, sfq_index idx = q->ht[cl - 1]; struct gnet_stats_queue qs = { 0 }; struct tc_sfq_xstats xstats = { 0 }; - struct sk_buff *skb; if (idx != SFQ_EMPTY_SLOT) { const struct sfq_slot *slot = &q->slots[idx]; xstats.allot = slot->allot << SFQ_ALLOT_SHIFT; qs.qlen = slot->qlen; - slot_queue_walk(slot, skb) - qs.backlog += qdisc_pkt_len(skb); + qs.backlog = slot->backlog; } if (gnet_stats_copy_queue(d, &qs) < 0) return -1; -- cgit v1.2.3