diff options
author | Dmitry Torokhov <dmitry.torokhov@gmail.com> | 2013-05-01 08:47:44 -0700 |
---|---|---|
committer | Dmitry Torokhov <dmitry.torokhov@gmail.com> | 2013-05-01 08:47:44 -0700 |
commit | bf61c8840efe60fd8f91446860b63338fb424158 (patch) | |
tree | 7a71832407a4f0d6346db773343f4c3ae2257b19 /net/sched | |
parent | 5846115b30f3a881e542c8bfde59a699c1c13740 (diff) | |
parent | 0c6a61657da78098472fd0eb71cc01f2387fa1bb (diff) |
Merge branch 'next' into for-linus
Prepare first set of updates for 3.10 merge window.
Diffstat (limited to 'net/sched')
31 files changed, 992 insertions, 527 deletions
diff --git a/net/sched/Kconfig b/net/sched/Kconfig index 62fb51face8a..235e01acac51 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -509,7 +509,7 @@ config NET_EMATCH_TEXT config NET_EMATCH_CANID tristate "CAN Identifier" - depends on NET_EMATCH && CAN + depends on NET_EMATCH && (CAN=y || CAN=m) ---help--- Say Y here if you want to be able to classify CAN frames based on CAN Identifier. diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 102761d294cb..8579c4bb20c9 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -485,8 +485,9 @@ errout: return err; } -struct tc_action *tcf_action_init_1(struct nlattr *nla, struct nlattr *est, - char *name, int ovr, int bind) +struct tc_action *tcf_action_init_1(struct net *net, struct nlattr *nla, + struct nlattr *est, char *name, int ovr, + int bind) { struct tc_action *a; struct tc_action_ops *a_o; @@ -542,9 +543,9 @@ struct tc_action *tcf_action_init_1(struct nlattr *nla, struct nlattr *est, /* backward compatibility for policer */ if (name == NULL) - err = a_o->init(tb[TCA_ACT_OPTIONS], est, a, ovr, bind); + err = a_o->init(net, tb[TCA_ACT_OPTIONS], est, a, ovr, bind); else - err = a_o->init(nla, est, a, ovr, bind); + err = a_o->init(net, nla, est, a, ovr, bind); if (err < 0) goto err_free; @@ -566,8 +567,9 @@ err_out: return ERR_PTR(err); } -struct tc_action *tcf_action_init(struct nlattr *nla, struct nlattr *est, - char *name, int ovr, int bind) +struct tc_action *tcf_action_init(struct net *net, struct nlattr *nla, + struct nlattr *est, char *name, int ovr, + int bind) { struct nlattr *tb[TCA_ACT_MAX_PRIO + 1]; struct tc_action *head = NULL, *act, *act_prev = NULL; @@ -579,7 +581,7 @@ struct tc_action *tcf_action_init(struct nlattr *nla, struct nlattr *est, return ERR_PTR(err); for (i = 1; i <= TCA_ACT_MAX_PRIO && tb[i]; i++) { - act = tcf_action_init_1(tb[i], est, name, ovr, bind); + act = tcf_action_init_1(net, tb[i], est, name, ovr, bind); if (IS_ERR(act)) goto err; act->order = i; @@ -960,7 +962,7 @@ tcf_action_add(struct net *net, struct nlattr *nla, struct nlmsghdr *n, struct tc_action *a; u32 seq = n->nlmsg_seq; - act = tcf_action_init(nla, NULL, NULL, ovr, 0); + act = tcf_action_init(net, nla, NULL, NULL, ovr, 0); if (act == NULL) goto done; if (IS_ERR(act)) { @@ -987,6 +989,9 @@ static int tc_ctl_action(struct sk_buff *skb, struct nlmsghdr *n, void *arg) u32 portid = skb ? NETLINK_CB(skb).portid : 0; int ret = 0, ovr = 0; + if ((n->nlmsg_type != RTM_GETACTION) && !capable(CAP_NET_ADMIN)) + return -EPERM; + ret = nlmsg_parse(n, sizeof(struct tcamsg), tca, TCA_ACT_MAX, NULL); if (ret < 0) return ret; diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c index 2c8ad7c86e43..08fa1e8a4ca4 100644 --- a/net/sched/act_csum.c +++ b/net/sched/act_csum.c @@ -51,7 +51,7 @@ static const struct nla_policy csum_policy[TCA_CSUM_MAX + 1] = { [TCA_CSUM_PARMS] = { .len = sizeof(struct tc_csum), }, }; -static int tcf_csum_init(struct nlattr *nla, struct nlattr *est, +static int tcf_csum_init(struct net *n, struct nlattr *nla, struct nlattr *est, struct tc_action *a, int ovr, int bind) { struct nlattr *tb[TCA_CSUM_MAX + 1]; diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c index 05d60859d8e3..fd2b3cff5fa2 100644 --- a/net/sched/act_gact.c +++ b/net/sched/act_gact.c @@ -58,8 +58,9 @@ static const struct nla_policy gact_policy[TCA_GACT_MAX + 1] = { [TCA_GACT_PROB] = { .len = sizeof(struct tc_gact_p) }, }; -static int tcf_gact_init(struct nlattr *nla, struct nlattr *est, - struct tc_action *a, int ovr, int bind) +static int tcf_gact_init(struct net *net, struct nlattr *nla, + struct nlattr *est, struct tc_action *a, + int ovr, int bind) { struct nlattr *tb[TCA_GACT_MAX + 1]; struct tc_gact *parm; diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c index 58fb3c7aab9e..e0f6de64afec 100644 --- a/net/sched/act_ipt.c +++ b/net/sched/act_ipt.c @@ -102,7 +102,7 @@ static const struct nla_policy ipt_policy[TCA_IPT_MAX + 1] = { [TCA_IPT_TARG] = { .len = sizeof(struct xt_entry_target) }, }; -static int tcf_ipt_init(struct nlattr *nla, struct nlattr *est, +static int tcf_ipt_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action *a, int ovr, int bind) { struct nlattr *tb[TCA_IPT_MAX + 1]; @@ -207,10 +207,8 @@ static int tcf_ipt(struct sk_buff *skb, const struct tc_action *a, struct tcf_ipt *ipt = a->priv; struct xt_action_param par; - if (skb_cloned(skb)) { - if (pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) - return TC_ACT_UNSPEC; - } + if (skb_unclone(skb, GFP_ATOMIC)) + return TC_ACT_UNSPEC; spin_lock(&ipt->tcf_lock); diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index 9c0fd0c78814..5d676edc22a6 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -62,8 +62,9 @@ static const struct nla_policy mirred_policy[TCA_MIRRED_MAX + 1] = { [TCA_MIRRED_PARMS] = { .len = sizeof(struct tc_mirred) }, }; -static int tcf_mirred_init(struct nlattr *nla, struct nlattr *est, - struct tc_action *a, int ovr, int bind) +static int tcf_mirred_init(struct net *net, struct nlattr *nla, + struct nlattr *est, struct tc_action *a, int ovr, + int bind) { struct nlattr *tb[TCA_MIRRED_MAX + 1]; struct tc_mirred *parm; @@ -88,7 +89,7 @@ static int tcf_mirred_init(struct nlattr *nla, struct nlattr *est, return -EINVAL; } if (parm->ifindex) { - dev = __dev_get_by_index(&init_net, parm->ifindex); + dev = __dev_get_by_index(net, parm->ifindex); if (dev == NULL) return -ENODEV; switch (dev->type) { diff --git a/net/sched/act_nat.c b/net/sched/act_nat.c index b5d029eb44f2..876f0ef29694 100644 --- a/net/sched/act_nat.c +++ b/net/sched/act_nat.c @@ -44,7 +44,7 @@ static const struct nla_policy nat_policy[TCA_NAT_MAX + 1] = { [TCA_NAT_PARMS] = { .len = sizeof(struct tc_nat) }, }; -static int tcf_nat_init(struct nlattr *nla, struct nlattr *est, +static int tcf_nat_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action *a, int ovr, int bind) { struct nlattr *tb[TCA_NAT_MAX + 1]; diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c index 45c53ab067a6..7ed78c9e505c 100644 --- a/net/sched/act_pedit.c +++ b/net/sched/act_pedit.c @@ -38,8 +38,9 @@ static const struct nla_policy pedit_policy[TCA_PEDIT_MAX + 1] = { [TCA_PEDIT_PARMS] = { .len = sizeof(struct tc_pedit) }, }; -static int tcf_pedit_init(struct nlattr *nla, struct nlattr *est, - struct tc_action *a, int ovr, int bind) +static int tcf_pedit_init(struct net *net, struct nlattr *nla, + struct nlattr *est, struct tc_action *a, + int ovr, int bind) { struct nlattr *tb[TCA_PEDIT_MAX + 1]; struct tc_pedit *parm; @@ -130,8 +131,7 @@ static int tcf_pedit(struct sk_buff *skb, const struct tc_action *a, int i, munged = 0; unsigned int off; - if (skb_cloned(skb) && - pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) + if (skb_unclone(skb, GFP_ATOMIC)) return p->tcf_action; off = skb_network_offset(skb); diff --git a/net/sched/act_police.c b/net/sched/act_police.c index a9de23297d47..823463adbd21 100644 --- a/net/sched/act_police.c +++ b/net/sched/act_police.c @@ -22,8 +22,23 @@ #include <net/act_api.h> #include <net/netlink.h> -#define L2T(p, L) qdisc_l2t((p)->tcfp_R_tab, L) -#define L2T_P(p, L) qdisc_l2t((p)->tcfp_P_tab, L) +struct tcf_police { + struct tcf_common common; + int tcfp_result; + u32 tcfp_ewma_rate; + s64 tcfp_burst; + u32 tcfp_mtu; + s64 tcfp_toks; + s64 tcfp_ptoks; + s64 tcfp_mtu_ptoks; + s64 tcfp_t_c; + struct psched_ratecfg rate; + bool rate_present; + struct psched_ratecfg peak; + bool peak_present; +}; +#define to_police(pc) \ + container_of(pc, struct tcf_police, common) #define POL_TAB_MASK 15 static struct tcf_common *tcf_police_ht[POL_TAB_MASK + 1]; @@ -108,10 +123,6 @@ static void tcf_police_destroy(struct tcf_police *p) write_unlock_bh(&police_lock); gen_kill_estimator(&p->tcf_bstats, &p->tcf_rate_est); - if (p->tcfp_R_tab) - qdisc_put_rtab(p->tcfp_R_tab); - if (p->tcfp_P_tab) - qdisc_put_rtab(p->tcfp_P_tab); /* * gen_estimator est_timer() might access p->tcf_lock * or bstats, wait a RCU grace period before freeing p @@ -130,8 +141,9 @@ static const struct nla_policy police_policy[TCA_POLICE_MAX + 1] = { [TCA_POLICE_RESULT] = { .type = NLA_U32 }, }; -static int tcf_act_police_locate(struct nlattr *nla, struct nlattr *est, - struct tc_action *a, int ovr, int bind) +static int tcf_act_police_locate(struct net *net, struct nlattr *nla, + struct nlattr *est, struct tc_action *a, + int ovr, int bind) { unsigned int h; int ret = 0, err; @@ -211,26 +223,36 @@ override: } /* No failure allowed after this point */ - if (R_tab != NULL) { - qdisc_put_rtab(police->tcfp_R_tab); - police->tcfp_R_tab = R_tab; + police->tcfp_mtu = parm->mtu; + if (police->tcfp_mtu == 0) { + police->tcfp_mtu = ~0; + if (R_tab) + police->tcfp_mtu = 255 << R_tab->rate.cell_log; + } + if (R_tab) { + police->rate_present = true; + psched_ratecfg_precompute(&police->rate, R_tab->rate.rate); + qdisc_put_rtab(R_tab); + } else { + police->rate_present = false; } - if (P_tab != NULL) { - qdisc_put_rtab(police->tcfp_P_tab); - police->tcfp_P_tab = P_tab; + if (P_tab) { + police->peak_present = true; + psched_ratecfg_precompute(&police->peak, P_tab->rate.rate); + qdisc_put_rtab(P_tab); + } else { + police->peak_present = false; } if (tb[TCA_POLICE_RESULT]) police->tcfp_result = nla_get_u32(tb[TCA_POLICE_RESULT]); - police->tcfp_toks = police->tcfp_burst = parm->burst; - police->tcfp_mtu = parm->mtu; - if (police->tcfp_mtu == 0) { - police->tcfp_mtu = ~0; - if (police->tcfp_R_tab) - police->tcfp_mtu = 255<<police->tcfp_R_tab->rate.cell_log; + police->tcfp_burst = PSCHED_TICKS2NS(parm->burst); + police->tcfp_toks = police->tcfp_burst; + if (police->peak_present) { + police->tcfp_mtu_ptoks = (s64) psched_l2t_ns(&police->peak, + police->tcfp_mtu); + police->tcfp_ptoks = police->tcfp_mtu_ptoks; } - if (police->tcfp_P_tab) - police->tcfp_ptoks = L2T_P(police, police->tcfp_mtu); police->tcf_action = parm->action; if (tb[TCA_POLICE_AVRATE]) @@ -240,7 +262,7 @@ override: if (ret != ACT_P_CREATED) return ret; - police->tcfp_t_c = psched_get_time(); + police->tcfp_t_c = ktime_to_ns(ktime_get()); police->tcf_index = parm->index ? parm->index : tcf_hash_new_index(&police_idx_gen, &police_hash_info); h = tcf_hash(police->tcf_index, POL_TAB_MASK); @@ -286,9 +308,9 @@ static int tcf_act_police(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { struct tcf_police *police = a->priv; - psched_time_t now; - long toks; - long ptoks = 0; + s64 now; + s64 toks; + s64 ptoks = 0; spin_lock(&police->tcf_lock); @@ -304,24 +326,25 @@ static int tcf_act_police(struct sk_buff *skb, const struct tc_action *a, } if (qdisc_pkt_len(skb) <= police->tcfp_mtu) { - if (police->tcfp_R_tab == NULL) { + if (!police->rate_present) { spin_unlock(&police->tcf_lock); return police->tcfp_result; } - now = psched_get_time(); - toks = psched_tdiff_bounded(now, police->tcfp_t_c, - police->tcfp_burst); - if (police->tcfp_P_tab) { + now = ktime_to_ns(ktime_get()); + toks = min_t(s64, now - police->tcfp_t_c, + police->tcfp_burst); + if (police->peak_present) { ptoks = toks + police->tcfp_ptoks; - if (ptoks > (long)L2T_P(police, police->tcfp_mtu)) - ptoks = (long)L2T_P(police, police->tcfp_mtu); - ptoks -= L2T_P(police, qdisc_pkt_len(skb)); + if (ptoks > police->tcfp_mtu_ptoks) + ptoks = police->tcfp_mtu_ptoks; + ptoks -= (s64) psched_l2t_ns(&police->peak, + qdisc_pkt_len(skb)); } toks += police->tcfp_toks; - if (toks > (long)police->tcfp_burst) + if (toks > police->tcfp_burst) toks = police->tcfp_burst; - toks -= L2T(police, qdisc_pkt_len(skb)); + toks -= (s64) psched_l2t_ns(&police->rate, qdisc_pkt_len(skb)); if ((toks|ptoks) >= 0) { police->tcfp_t_c = now; police->tcfp_toks = toks; @@ -347,15 +370,15 @@ tcf_act_police_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) .index = police->tcf_index, .action = police->tcf_action, .mtu = police->tcfp_mtu, - .burst = police->tcfp_burst, + .burst = PSCHED_NS2TICKS(police->tcfp_burst), .refcnt = police->tcf_refcnt - ref, .bindcnt = police->tcf_bindcnt - bind, }; - if (police->tcfp_R_tab) - opt.rate = police->tcfp_R_tab->rate; - if (police->tcfp_P_tab) - opt.peakrate = police->tcfp_P_tab->rate; + if (police->rate_present) + opt.rate.rate = psched_ratecfg_getrate(&police->rate); + if (police->peak_present) + opt.peakrate.rate = psched_ratecfg_getrate(&police->peak); if (nla_put(skb, TCA_POLICE_TBF, sizeof(opt), &opt)) goto nla_put_failure; if (police->tcfp_result && diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c index 3714f60f0b3c..7725eb4ab756 100644 --- a/net/sched/act_simple.c +++ b/net/sched/act_simple.c @@ -95,8 +95,9 @@ static const struct nla_policy simple_policy[TCA_DEF_MAX + 1] = { [TCA_DEF_DATA] = { .type = NLA_STRING, .len = SIMP_MAX_DATA }, }; -static int tcf_simp_init(struct nlattr *nla, struct nlattr *est, - struct tc_action *a, int ovr, int bind) +static int tcf_simp_init(struct net *net, struct nlattr *nla, + struct nlattr *est, struct tc_action *a, + int ovr, int bind) { struct nlattr *tb[TCA_DEF_MAX + 1]; struct tc_defact *parm; diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c index 476e0fac6712..cb4221171f93 100644 --- a/net/sched/act_skbedit.c +++ b/net/sched/act_skbedit.c @@ -67,8 +67,9 @@ static const struct nla_policy skbedit_policy[TCA_SKBEDIT_MAX + 1] = { [TCA_SKBEDIT_MARK] = { .len = sizeof(u32) }, }; -static int tcf_skbedit_init(struct nlattr *nla, struct nlattr *est, - struct tc_action *a, int ovr, int bind) +static int tcf_skbedit_init(struct net *net, struct nlattr *nla, + struct nlattr *est, struct tc_action *a, + int ovr, int bind) { struct nlattr *tb[TCA_SKBEDIT_MAX + 1]; struct tc_skbedit *parm; diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 7ae02892437c..964f5e4f4b8a 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -139,6 +139,8 @@ static int tc_ctl_tfilter(struct sk_buff *skb, struct nlmsghdr *n, void *arg) int err; int tp_created = 0; + if ((n->nlmsg_type != RTM_GETTFILTER) && !capable(CAP_NET_ADMIN)) + return -EPERM; replay: t = nlmsg_data(n); protocol = TC_H_MIN(t->tcm_info); @@ -319,7 +321,7 @@ replay: } } - err = tp->ops->change(skb, tp, cl, t->tcm_handle, tca, &fh); + err = tp->ops->change(net, skb, tp, cl, t->tcm_handle, tca, &fh); if (err == 0) { if (tp_created) { spin_lock_bh(root_lock); @@ -506,7 +508,7 @@ void tcf_exts_destroy(struct tcf_proto *tp, struct tcf_exts *exts) } EXPORT_SYMBOL(tcf_exts_destroy); -int tcf_exts_validate(struct tcf_proto *tp, struct nlattr **tb, +int tcf_exts_validate(struct net *net, struct tcf_proto *tp, struct nlattr **tb, struct nlattr *rate_tlv, struct tcf_exts *exts, const struct tcf_ext_map *map) { @@ -517,7 +519,7 @@ int tcf_exts_validate(struct tcf_proto *tp, struct nlattr **tb, struct tc_action *act; if (map->police && tb[map->police]) { - act = tcf_action_init_1(tb[map->police], rate_tlv, + act = tcf_action_init_1(net, tb[map->police], rate_tlv, "police", TCA_ACT_NOREPLACE, TCA_ACT_BIND); if (IS_ERR(act)) @@ -526,8 +528,9 @@ int tcf_exts_validate(struct tcf_proto *tp, struct nlattr **tb, act->type = TCA_OLD_COMPAT; exts->action = act; } else if (map->action && tb[map->action]) { - act = tcf_action_init(tb[map->action], rate_tlv, NULL, - TCA_ACT_NOREPLACE, TCA_ACT_BIND); + act = tcf_action_init(net, tb[map->action], rate_tlv, + NULL, TCA_ACT_NOREPLACE, + TCA_ACT_BIND); if (IS_ERR(act)) return PTR_ERR(act); diff --git a/net/sched/cls_basic.c b/net/sched/cls_basic.c index 344a11b342e5..d76a35d0dc85 100644 --- a/net/sched/cls_basic.c +++ b/net/sched/cls_basic.c @@ -132,15 +132,16 @@ static const struct nla_policy basic_policy[TCA_BASIC_MAX + 1] = { [TCA_BASIC_EMATCHES] = { .type = NLA_NESTED }, }; -static int basic_set_parms(struct tcf_proto *tp, struct basic_filter *f, - unsigned long base, struct nlattr **tb, +static int basic_set_parms(struct net *net, struct tcf_proto *tp, + struct basic_filter *f, unsigned long base, + struct nlattr **tb, struct nlattr *est) { int err = -EINVAL; struct tcf_exts e; struct tcf_ematch_tree t; - err = tcf_exts_validate(tp, tb, est, &e, &basic_ext_map); + err = tcf_exts_validate(net, tp, tb, est, &e, &basic_ext_map); if (err < 0) return err; @@ -162,7 +163,7 @@ errout: return err; } -static int basic_change(struct sk_buff *in_skb, +static int basic_change(struct net *net, struct sk_buff *in_skb, struct tcf_proto *tp, unsigned long base, u32 handle, struct nlattr **tca, unsigned long *arg) { @@ -182,7 +183,7 @@ static int basic_change(struct sk_buff *in_skb, if (f != NULL) { if (handle && f->handle != handle) return -EINVAL; - return basic_set_parms(tp, f, base, tb, tca[TCA_RATE]); + return basic_set_parms(net, tp, f, base, tb, tca[TCA_RATE]); } err = -ENOBUFS; @@ -208,7 +209,7 @@ static int basic_change(struct sk_buff *in_skb, f->handle = head->hgenerator; } - err = basic_set_parms(tp, f, base, tb, tca[TCA_RATE]); + err = basic_set_parms(net, tp, f, base, tb, tca[TCA_RATE]); if (err < 0) goto errout; diff --git a/net/sched/cls_cgroup.c b/net/sched/cls_cgroup.c index 2ecde225ae60..3a294eb98d61 100644 --- a/net/sched/cls_cgroup.c +++ b/net/sched/cls_cgroup.c @@ -17,6 +17,7 @@ #include <linux/skbuff.h> #include <linux/cgroup.h> #include <linux/rcupdate.h> +#include <linux/fdtable.h> #include <net/rtnetlink.h> #include <net/pkt_cls.h> #include <net/sock.h> @@ -34,25 +35,51 @@ static inline struct cgroup_cls_state *task_cls_state(struct task_struct *p) struct cgroup_cls_state, css); } -static struct cgroup_subsys_state *cgrp_create(struct cgroup *cgrp) +static struct cgroup_subsys_state *cgrp_css_alloc(struct cgroup *cgrp) { struct cgroup_cls_state *cs; cs = kzalloc(sizeof(*cs), GFP_KERNEL); if (!cs) return ERR_PTR(-ENOMEM); + return &cs->css; +} +static int cgrp_css_online(struct cgroup *cgrp) +{ if (cgrp->parent) - cs->classid = cgrp_cls_state(cgrp->parent)->classid; - - return &cs->css; + cgrp_cls_state(cgrp)->classid = + cgrp_cls_state(cgrp->parent)->classid; + return 0; } -static void cgrp_destroy(struct cgroup *cgrp) +static void cgrp_css_free(struct cgroup *cgrp) { kfree(cgrp_cls_state(cgrp)); } +static int update_classid(const void *v, struct file *file, unsigned n) +{ + int err; + struct socket *sock = sock_from_file(file, &err); + if (sock) + sock->sk->sk_classid = (u32)(unsigned long)v; + return 0; +} + +static void cgrp_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) +{ + struct task_struct *p; + void *v; + + cgroup_taskset_for_each(p, cgrp, tset) { + task_lock(p); + v = (void *)(unsigned long)task_cls_classid(p); + iterate_fd(p->files, 0, update_classid, v); + task_unlock(p); + } +} + static u64 read_classid(struct cgroup *cgrp, struct cftype *cft) { return cgrp_cls_state(cgrp)->classid; @@ -75,20 +102,13 @@ static struct cftype ss_files[] = { struct cgroup_subsys net_cls_subsys = { .name = "net_cls", - .create = cgrp_create, - .destroy = cgrp_destroy, + .css_alloc = cgrp_css_alloc, + .css_online = cgrp_css_online, + .css_free = cgrp_css_free, + .attach = cgrp_attach, .subsys_id = net_cls_subsys_id, .base_cftypes = ss_files, .module = THIS_MODULE, - - /* - * While net_cls cgroup has the rudimentary hierarchy support of - * inheriting the parent's classid on cgroup creation, it doesn't - * properly propagates config changes in ancestors to their - * descendents. A child should follow the parent's configuration - * but be allowed to override it. Fix it and remove the following. - */ - .broken_hierarchy = true, }; struct cls_cgroup_head { @@ -158,7 +178,7 @@ static const struct nla_policy cgroup_policy[TCA_CGROUP_MAX + 1] = { [TCA_CGROUP_EMATCHES] = { .type = NLA_NESTED }, }; -static int cls_cgroup_change(struct sk_buff *in_skb, +static int cls_cgroup_change(struct net *net, struct sk_buff *in_skb, struct tcf_proto *tp, unsigned long base, u32 handle, struct nlattr **tca, unsigned long *arg) @@ -195,7 +215,8 @@ static int cls_cgroup_change(struct sk_buff *in_skb, if (err < 0) return err; - err = tcf_exts_validate(tp, tb, tca[TCA_RATE], &e, &cgroup_ext_map); + err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &e, + &cgroup_ext_map); if (err < 0) return err; diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c index ce82d0cb1b47..aa36a8c8b33b 100644 --- a/net/sched/cls_flow.c +++ b/net/sched/cls_flow.c @@ -351,7 +351,7 @@ static const struct nla_policy flow_policy[TCA_FLOW_MAX + 1] = { [TCA_FLOW_PERTURB] = { .type = NLA_U32 }, }; -static int flow_change(struct sk_buff *in_skb, +static int flow_change(struct net *net, struct sk_buff *in_skb, struct tcf_proto *tp, unsigned long base, u32 handle, struct nlattr **tca, unsigned long *arg) @@ -397,7 +397,7 @@ static int flow_change(struct sk_buff *in_skb, return -EOPNOTSUPP; } - err = tcf_exts_validate(tp, tb, tca[TCA_RATE], &e, &flow_ext_map); + err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &e, &flow_ext_map); if (err < 0) return err; diff --git a/net/sched/cls_fw.c b/net/sched/cls_fw.c index 4075a0aef2aa..1135d8227f9b 100644 --- a/net/sched/cls_fw.c +++ b/net/sched/cls_fw.c @@ -192,7 +192,7 @@ static const struct nla_policy fw_policy[TCA_FW_MAX + 1] = { }; static int -fw_change_attrs(struct tcf_proto *tp, struct fw_filter *f, +fw_change_attrs(struct net *net, struct tcf_proto *tp, struct fw_filter *f, struct nlattr **tb, struct nlattr **tca, unsigned long base) { struct fw_head *head = (struct fw_head *)tp->root; @@ -200,7 +200,7 @@ fw_change_attrs(struct tcf_proto *tp, struct fw_filter *f, u32 mask; int err; - err = tcf_exts_validate(tp, tb, tca[TCA_RATE], &e, &fw_ext_map); + err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &e, &fw_ext_map); if (err < 0) return err; @@ -233,7 +233,7 @@ errout: return err; } -static int fw_change(struct sk_buff *in_skb, +static int fw_change(struct net *net, struct sk_buff *in_skb, struct tcf_proto *tp, unsigned long base, u32 handle, struct nlattr **tca, @@ -255,7 +255,7 @@ static int fw_change(struct sk_buff *in_skb, if (f != NULL) { if (f->id != handle && handle) return -EINVAL; - return fw_change_attrs(tp, f, tb, tca, base); + return fw_change_attrs(net, tp, f, tb, tca, base); } if (!handle) @@ -282,7 +282,7 @@ static int fw_change(struct sk_buff *in_skb, f->id = handle; - err = fw_change_attrs(tp, f, tb, tca, base); + err = fw_change_attrs(net, tp, f, tb, tca, base); if (err < 0) goto errout; diff --git a/net/sched/cls_route.c b/net/sched/cls_route.c index c10d57bf98f2..37da567d833e 100644 --- a/net/sched/cls_route.c +++ b/net/sched/cls_route.c @@ -335,9 +335,10 @@ static const struct nla_policy route4_policy[TCA_ROUTE4_MAX + 1] = { [TCA_ROUTE4_IIF] = { .type = NLA_U32 }, }; -static int route4_set_parms(struct tcf_proto *tp, unsigned long base, - struct route4_filter *f, u32 handle, struct route4_head *head, - struct nlattr **tb, struct nlattr *est, int new) +static int route4_set_parms(struct net *net, struct tcf_proto *tp, + unsigned long base, struct route4_filter *f, + u32 handle, struct route4_head *head, + struct nlattr **tb, struct nlattr *est, int new) { int err; u32 id = 0, to = 0, nhandle = 0x8000; @@ -346,7 +347,7 @@ static int route4_set_parms(struct tcf_proto *tp, unsigned long base, struct route4_bucket *b; struct tcf_exts e; - err = tcf_exts_validate(tp, tb, est, &e, &route_ext_map); + err = tcf_exts_validate(net, tp, tb, est, &e, &route_ext_map); if (err < 0) return err; @@ -427,7 +428,7 @@ errout: return err; } -static int route4_change(struct sk_buff *in_skb, +static int route4_change(struct net *net, struct sk_buff *in_skb, struct tcf_proto *tp, unsigned long base, u32 handle, struct nlattr **tca, @@ -457,7 +458,7 @@ static int route4_change(struct sk_buff *in_skb, if (f->bkt) old_handle = f->handle; - err = route4_set_parms(tp, base, f, handle, head, tb, + err = route4_set_parms(net, tp, base, f, handle, head, tb, tca[TCA_RATE], 0); if (err < 0) return err; @@ -480,7 +481,7 @@ static int route4_change(struct sk_buff *in_skb, if (f == NULL) goto errout; - err = route4_set_parms(tp, base, f, handle, head, tb, + err = route4_set_parms(net, tp, base, f, handle, head, tb, tca[TCA_RATE], 1); if (err < 0) goto errout; diff --git a/net/sched/cls_rsvp.h b/net/sched/cls_rsvp.h index 494bbb90924a..252d8b05872e 100644 --- a/net/sched/cls_rsvp.h +++ b/net/sched/cls_rsvp.h @@ -416,7 +416,7 @@ static const struct nla_policy rsvp_policy[TCA_RSVP_MAX + 1] = { [TCA_RSVP_PINFO] = { .len = sizeof(struct tc_rsvp_pinfo) }, }; -static int rsvp_change(struct sk_buff *in_skb, +static int rsvp_change(struct net *net, struct sk_buff *in_skb, struct tcf_proto *tp, unsigned long base, u32 handle, struct nlattr **tca, @@ -440,7 +440,7 @@ static int rsvp_change(struct sk_buff *in_skb, if (err < 0) return err; - err = tcf_exts_validate(tp, tb, tca[TCA_RATE], &e, &rsvp_ext_map); + err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &e, &rsvp_ext_map); if (err < 0) return err; diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c index a1293b4ab7a1..b86535a40169 100644 --- a/net/sched/cls_tcindex.c +++ b/net/sched/cls_tcindex.c @@ -197,9 +197,10 @@ static const struct nla_policy tcindex_policy[TCA_TCINDEX_MAX + 1] = { }; static int -tcindex_set_parms(struct tcf_proto *tp, unsigned long base, u32 handle, - struct tcindex_data *p, struct tcindex_filter_result *r, - struct nlattr **tb, struct nlattr *est) +tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, + u32 handle, struct tcindex_data *p, + struct tcindex_filter_result *r, struct nlattr **tb, + struct nlattr *est) { int err, balloc = 0; struct tcindex_filter_result new_filter_result, *old_r = r; @@ -208,7 +209,7 @@ tcindex_set_parms(struct tcf_proto *tp, unsigned long base, u32 handle, struct tcindex_filter *f = NULL; /* make gcc behave */ struct tcf_exts e; - err = tcf_exts_validate(tp, tb, est, &e, &tcindex_ext_map); + err = tcf_exts_validate(net, tp, tb, est, &e, &tcindex_ext_map); if (err < 0) return err; @@ -332,7 +333,7 @@ errout: } static int -tcindex_change(struct sk_buff *in_skb, +tcindex_change(struct net *net, struct sk_buff *in_skb, struct tcf_proto *tp, unsigned long base, u32 handle, struct nlattr **tca, unsigned long *arg) { @@ -353,7 +354,8 @@ tcindex_change(struct sk_buff *in_skb, if (err < 0) return err; - return tcindex_set_parms(tp, base, handle, p, r, tb, tca[TCA_RATE]); + return tcindex_set_parms(net, tp, base, handle, p, r, tb, + tca[TCA_RATE]); } diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c index c7c27bc91b5a..eb07a1e536e6 100644 --- a/net/sched/cls_u32.c +++ b/net/sched/cls_u32.c @@ -488,15 +488,15 @@ static const struct nla_policy u32_policy[TCA_U32_MAX + 1] = { [TCA_U32_MARK] = { .len = sizeof(struct tc_u32_mark) }, }; -static int u32_set_parms(struct tcf_proto *tp, unsigned long base, - struct tc_u_hnode *ht, +static int u32_set_parms(struct net *net, struct tcf_proto *tp, + unsigned long base, struct tc_u_hnode *ht, struct tc_u_knode *n, struct nlattr **tb, struct nlattr *est) { int err; struct tcf_exts e; - err = tcf_exts_validate(tp, tb, est, &e, &u32_ext_map); + err = tcf_exts_validate(net, tp, tb, est, &e, &u32_ext_map); if (err < 0) return err; @@ -544,7 +544,7 @@ errout: return err; } -static int u32_change(struct sk_buff *in_skb, +static int u32_change(struct net *net, struct sk_buff *in_skb, struct tcf_proto *tp, unsigned long base, u32 handle, struct nlattr **tca, unsigned long *arg) @@ -570,7 +570,8 @@ static int u32_change(struct sk_buff *in_skb, if (TC_U32_KEY(n->handle) == 0) return -EINVAL; - return u32_set_parms(tp, base, n->ht_up, n, tb, tca[TCA_RATE]); + return u32_set_parms(net, tp, base, n->ht_up, n, tb, + tca[TCA_RATE]); } if (tb[TCA_U32_DIVISOR]) { @@ -656,7 +657,7 @@ static int u32_change(struct sk_buff *in_skb, } #endif - err = u32_set_parms(tp, base, ht, n, tb, tca[TCA_RATE]); + err = u32_set_parms(net, tp, base, ht, n, tb, tca[TCA_RATE]); if (err == 0) { struct tc_u_knode **ins; for (ins = &ht->ht[TC_U32_HASH(handle)]; *ins; ins = &(*ins)->next) diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index a18d975db59c..c297e2a8e2a1 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -493,20 +493,19 @@ void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc) } EXPORT_SYMBOL(qdisc_watchdog_init); -void qdisc_watchdog_schedule(struct qdisc_watchdog *wd, psched_time_t expires) +void qdisc_watchdog_schedule_ns(struct qdisc_watchdog *wd, u64 expires) { - ktime_t time; - if (test_bit(__QDISC_STATE_DEACTIVATED, &qdisc_root_sleeping(wd->qdisc)->state)) return; qdisc_throttled(wd->qdisc); - time = ktime_set(0, 0); - time = ktime_add_ns(time, PSCHED_TICKS2NS(expires)); - hrtimer_start(&wd->timer, time, HRTIMER_MODE_ABS); + + hrtimer_start(&wd->timer, + ns_to_ktime(expires), + HRTIMER_MODE_ABS); } -EXPORT_SYMBOL(qdisc_watchdog_schedule); +EXPORT_SYMBOL(qdisc_watchdog_schedule_ns); void qdisc_watchdog_cancel(struct qdisc_watchdog *wd) { @@ -546,7 +545,7 @@ static void qdisc_class_hash_free(struct hlist_head *h, unsigned int n) void qdisc_class_hash_grow(struct Qdisc *sch, struct Qdisc_class_hash *clhash) { struct Qdisc_class_common *cl; - struct hlist_node *n, *next; + struct hlist_node *next; struct hlist_head *nhash, *ohash; unsigned int nsize, nmask, osize; unsigned int i, h; @@ -565,7 +564,7 @@ void qdisc_class_hash_grow(struct Qdisc *sch, struct Qdisc_class_hash *clhash) sch_tree_lock(sch); for (i = 0; i < osize; i++) { - hlist_for_each_entry_safe(cl, n, next, &ohash[i], hnode) { + hlist_for_each_entry_safe(cl, next, &ohash[i], hnode) { h = qdisc_class_hash(cl->classid, nmask); hlist_add_head(&cl->hnode, &nhash[h]); } @@ -834,6 +833,8 @@ qdisc_create(struct net_device *dev, struct netdev_queue *dev_queue, goto err_out3; } lockdep_set_class(qdisc_lock(sch), &qdisc_tx_lock); + if (!netif_is_multiqueue(dev)) + sch->flags |= TCQ_F_ONETXQUEUE; } sch->handle = handle; @@ -981,6 +982,9 @@ static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg) struct Qdisc *p = NULL; int err; + if ((n->nlmsg_type != RTM_GETQDISC) && !capable(CAP_NET_ADMIN)) + return -EPERM; + dev = __dev_get_by_index(net, tcm->tcm_ifindex); if (!dev) return -ENODEV; @@ -1044,6 +1048,9 @@ static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg) struct Qdisc *q, *p; int err; + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + replay: /* Reinit, just in case something touches this. */ tcm = nlmsg_data(n); @@ -1380,6 +1387,9 @@ static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg) u32 qid = TC_H_MAJ(clid); int err; + if ((n->nlmsg_type != RTM_GETTCLASS) && !capable(CAP_NET_ADMIN)) + return -EPERM; + dev = __dev_get_by_index(net, tcm->tcm_ifindex); if (!dev) return -ENODEV; @@ -1758,7 +1768,7 @@ static int __net_init psched_net_init(struct net *net) { struct proc_dir_entry *e; - e = proc_net_fops_create(net, "psched", 0, &psched_fops); + e = proc_create("psched", 0, net->proc_net, &psched_fops); if (e == NULL) return -ENOMEM; @@ -1767,7 +1777,7 @@ static int __net_init psched_net_init(struct net *net) static void __net_exit psched_net_exit(struct net *net) { - proc_net_remove(net, "psched"); + remove_proc_entry("psched", net->proc_net); } #else static int __net_init psched_net_init(struct net *net) diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c index 564b9fc8efd3..13aa47aa2ffb 100644 --- a/net/sched/sch_cbq.c +++ b/net/sched/sch_cbq.c @@ -509,8 +509,7 @@ static void cbq_ovl_delay(struct cbq_class *cl) cl->cpriority = TC_CBQ_MAXPRIO; q->pmask |= (1<<TC_CBQ_MAXPRIO); - expires = ktime_set(0, 0); - expires = ktime_add_ns(expires, PSCHED_TICKS2NS(sched)); + expires = ns_to_ktime(PSCHED_TICKS2NS(sched)); if (hrtimer_try_to_cancel(&q->delay_timer) && ktime_to_ns(ktime_sub( hrtimer_get_expires(&q->delay_timer), @@ -1042,14 +1041,13 @@ static void cbq_adjust_levels(struct cbq_class *this) static void cbq_normalize_quanta(struct cbq_sched_data *q, int prio) { struct cbq_class *cl; - struct hlist_node *n; unsigned int h; if (q->quanta[prio] == 0) return; for (h = 0; h < q->clhash.hashsize; h++) { - hlist_for_each_entry(cl, n, &q->clhash.hash[h], common.hnode) { + hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) { /* BUGGGG... Beware! This expression suffer of * arithmetic overflows! */ @@ -1088,10 +1086,9 @@ static void cbq_sync_defmap(struct cbq_class *cl) continue; for (h = 0; h < q->clhash.hashsize; h++) { - struct hlist_node *n; struct cbq_class *c; - hlist_for_each_entry(c, n, &q->clhash.hash[h], + hlist_for_each_entry(c, &q->clhash.hash[h], common.hnode) { if (c->split == split && c->level < level && c->defmap & (1<<i)) { @@ -1211,7 +1208,6 @@ cbq_reset(struct Qdisc *sch) { struct cbq_sched_data *q = qdisc_priv(sch); struct cbq_class *cl; - struct hlist_node *n; int prio; unsigned int h; @@ -1229,7 +1225,7 @@ cbq_reset(struct Qdisc *sch) q->active[prio] = NULL; for (h = 0; h < q->clhash.hashsize; h++) { - hlist_for_each_entry(cl, n, &q->clhash.hash[h], common.hnode) { + hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) { qdisc_reset(cl->q); cl->next_alive = NULL; @@ -1698,7 +1694,7 @@ static void cbq_destroy_class(struct Qdisc *sch, struct cbq_class *cl) static void cbq_destroy(struct Qdisc *sch) { struct cbq_sched_data *q = qdisc_priv(sch); - struct hlist_node *n, *next; + struct hlist_node *next; struct cbq_class *cl; unsigned int h; @@ -1711,11 +1707,11 @@ static void cbq_destroy(struct Qdisc *sch) * be bound to classes which have been destroyed already. --TGR '04 */ for (h = 0; h < q->clhash.hashsize; h++) { - hlist_for_each_entry(cl, n, &q->clhash.hash[h], common.hnode) + hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) tcf_destroy_chain(&cl->filter_list); } for (h = 0; h < q->clhash.hashsize; h++) { - hlist_for_each_entry_safe(cl, n, next, &q->clhash.hash[h], + hlist_for_each_entry_safe(cl, next, &q->clhash.hash[h], common.hnode) cbq_destroy_class(sch, cl); } @@ -2014,14 +2010,13 @@ static void cbq_walk(struct Qdisc *sch, struct qdisc_walker *arg) { struct cbq_sched_data *q = qdisc_priv(sch); struct cbq_class *cl; - struct hlist_node *n; unsigned int h; if (arg->stop) return; for (h = 0; h < q->clhash.hashsize; h++) { - hlist_for_each_entry(cl, n, &q->clhash.hash[h], common.hnode) { + hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) { if (arg->count < arg->skip) { arg->count++; continue; diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c index 71e50c80315f..759b308d1a8d 100644 --- a/net/sched/sch_drr.c +++ b/net/sched/sch_drr.c @@ -293,14 +293,13 @@ static void drr_walk(struct Qdisc *sch, struct qdisc_walker *arg) { struct drr_sched *q = qdisc_priv(sch); struct drr_class *cl; - struct hlist_node *n; unsigned int i; if (arg->stop) return; for (i = 0; i < q->clhash.hashsize; i++) { - hlist_for_each_entry(cl, n, &q->clhash.hash[i], common.hnode) { + hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) { if (arg->count < arg->skip) { arg->count++; continue; @@ -451,11 +450,10 @@ static void drr_reset_qdisc(struct Qdisc *sch) { struct drr_sched *q = qdisc_priv(sch); struct drr_class *cl; - struct hlist_node *n; unsigned int i; for (i = 0; i < q->clhash.hashsize; i++) { - hlist_for_each_entry(cl, n, &q->clhash.hash[i], common.hnode) { + hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) { if (cl->qdisc->q.qlen) list_del(&cl->alist); qdisc_reset(cl->qdisc); @@ -468,13 +466,13 @@ static void drr_destroy_qdisc(struct Qdisc *sch) { struct drr_sched *q = qdisc_priv(sch); struct drr_class *cl; - struct hlist_node *n, *next; + struct hlist_node *next; unsigned int i; tcf_destroy_chain(&q->filter_list); for (i = 0; i < q->clhash.hashsize; i++) { - hlist_for_each_entry_safe(cl, n, next, &q->clhash.hash[i], + hlist_for_each_entry_safe(cl, next, &q->clhash.hash[i], common.hnode) drr_destroy_class(sch, cl); } diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index aefc1504dc88..ffad48109a22 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -25,6 +25,7 @@ #include <linux/rcupdate.h> #include <linux/list.h> #include <linux/slab.h> +#include <net/sch_generic.h> #include <net/pkt_sched.h> #include <net/dst.h> @@ -53,20 +54,19 @@ static inline int dev_requeue_skb(struct sk_buff *skb, struct Qdisc *q) static inline struct sk_buff *dequeue_skb(struct Qdisc *q) { struct sk_buff *skb = q->gso_skb; + const struct netdev_queue *txq = q->dev_queue; if (unlikely(skb)) { - struct net_device *dev = qdisc_dev(q); - struct netdev_queue *txq; - /* check the reason of requeuing without tx lock first */ - txq = netdev_get_tx_queue(dev, skb_get_queue_mapping(skb)); + txq = netdev_get_tx_queue(txq->dev, skb_get_queue_mapping(skb)); if (!netif_xmit_frozen_or_stopped(txq)) { q->gso_skb = NULL; q->q.qlen--; } else skb = NULL; } else { - skb = q->dequeue(q); + if (!(q->flags & TCQ_F_ONETXQUEUE) || !netif_xmit_frozen_or_stopped(txq)) + skb = q->dequeue(q); } return skb; @@ -686,6 +686,8 @@ static void attach_one_default_qdisc(struct net_device *dev, netdev_info(dev, "activation failed\n"); return; } + if (!netif_is_multiqueue(dev)) + qdisc->flags |= TCQ_F_ONETXQUEUE; } dev_queue->qdisc_sleeping = qdisc; } @@ -895,3 +897,39 @@ void dev_shutdown(struct net_device *dev) WARN_ON(timer_pending(&dev->watchdog_timer)); } + +void psched_ratecfg_precompute(struct psched_ratecfg *r, u32 rate) +{ + u64 factor; + u64 mult; + int shift; + + r->rate_bps = rate << 3; + r->shift = 0; + r->mult = 1; + /* + * Calibrate mult, shift so that token counting is accurate + * for smallest packet size (64 bytes). Token (time in ns) is + * computed as (bytes * 8) * NSEC_PER_SEC / rate_bps. It will + * work as long as the smallest packet transfer time can be + * accurately represented in nanosec. + */ + if (r->rate_bps > 0) { + /* + * Higher shift gives better accuracy. Find the largest + * shift such that mult fits in 32 bits. + */ + for (shift = 0; shift < 16; shift++) { + r->shift = shift; + factor = 8LLU * NSEC_PER_SEC * (1 << r->shift); + mult = div64_u64(factor, r->rate_bps); + if (mult > UINT_MAX) + break; + } + + r->shift = shift - 1; + factor = 8LLU * NSEC_PER_SEC * (1 << r->shift); + r->mult = div64_u64(factor, r->rate_bps); + } +} +EXPORT_SYMBOL(psched_ratecfg_precompute); diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c index 6c2ec4510540..9facea03faeb 100644 --- a/net/sched/sch_hfsc.c +++ b/net/sched/sch_hfsc.c @@ -1389,7 +1389,6 @@ static void hfsc_walk(struct Qdisc *sch, struct qdisc_walker *arg) { struct hfsc_sched *q = qdisc_priv(sch); - struct hlist_node *n; struct hfsc_class *cl; unsigned int i; @@ -1397,7 +1396,7 @@ hfsc_walk(struct Qdisc *sch, struct qdisc_walker *arg) return; for (i = 0; i < q->clhash.hashsize; i++) { - hlist_for_each_entry(cl, n, &q->clhash.hash[i], + hlist_for_each_entry(cl, &q->clhash.hash[i], cl_common.hnode) { if (arg->count < arg->skip) { arg->count++; @@ -1523,11 +1522,10 @@ hfsc_reset_qdisc(struct Qdisc *sch) { struct hfsc_sched *q = qdisc_priv(sch); struct hfsc_class *cl; - struct hlist_node *n; unsigned int i; for (i = 0; i < q->clhash.hashsize; i++) { - hlist_for_each_entry(cl, n, &q->clhash.hash[i], cl_common.hnode) + hlist_for_each_entry(cl, &q->clhash.hash[i], cl_common.hnode) hfsc_reset_class(cl); } q->eligible = RB_ROOT; @@ -1540,16 +1538,16 @@ static void hfsc_destroy_qdisc(struct Qdisc *sch) { struct hfsc_sched *q = qdisc_priv(sch); - struct hlist_node *n, *next; + struct hlist_node *next; struct hfsc_class *cl; unsigned int i; for (i = 0; i < q->clhash.hashsize; i++) { - hlist_for_each_entry(cl, n, &q->clhash.hash[i], cl_common.hnode) + hlist_for_each_entry(cl, &q->clhash.hash[i], cl_common.hnode) tcf_destroy_chain(&cl->filter_list); } for (i = 0; i < q->clhash.hashsize; i++) { - hlist_for_each_entry_safe(cl, n, next, &q->clhash.hash[i], + hlist_for_each_entry_safe(cl, next, &q->clhash.hash[i], cl_common.hnode) hfsc_destroy_class(sch, cl); } @@ -1564,12 +1562,11 @@ hfsc_dump_qdisc(struct Qdisc *sch, struct sk_buff *skb) unsigned char *b = skb_tail_pointer(skb); struct tc_hfsc_qopt qopt; struct hfsc_class *cl; - struct hlist_node *n; unsigned int i; sch->qstats.backlog = 0; for (i = 0; i < q->clhash.hashsize; i++) { - hlist_for_each_entry(cl, n, &q->clhash.hash[i], cl_common.hnode) + hlist_for_each_entry(cl, &q->clhash.hash[i], cl_common.hnode) sch->qstats.backlog += cl->qdisc->qstats.backlog; } diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index 9d75b7761313..571f1d211f4d 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -38,6 +38,7 @@ #include <linux/workqueue.h> #include <linux/slab.h> #include <net/netlink.h> +#include <net/sch_generic.h> #include <net/pkt_sched.h> /* HTB algorithm. @@ -118,11 +119,11 @@ struct htb_class { int filter_cnt; /* token bucket parameters */ - struct qdisc_rate_table *rate; /* rate table of the class itself */ - struct qdisc_rate_table *ceil; /* ceiling rate (limits borrows too) */ - long buffer, cbuffer; /* token bucket depth/rate */ + struct psched_ratecfg rate; + struct psched_ratecfg ceil; + s64 buffer, cbuffer; /* token bucket depth/rate */ psched_tdiff_t mbuffer; /* max wait time */ - long tokens, ctokens; /* current number of tokens */ + s64 tokens, ctokens; /* current number of tokens */ psched_time_t t_c; /* checkpoint time */ }; @@ -273,7 +274,7 @@ static void htb_add_to_id_tree(struct rb_root *root, * already in the queue. */ static void htb_add_to_wait_tree(struct htb_sched *q, - struct htb_class *cl, long delay) + struct htb_class *cl, s64 delay) { struct rb_node **p = &q->wait_pq[cl->level].rb_node, *parent = NULL; @@ -441,14 +442,14 @@ static void htb_deactivate_prios(struct htb_sched *q, struct htb_class *cl) htb_remove_class_from_row(q, cl, mask); } -static inline long htb_lowater(const struct htb_class *cl) +static inline s64 htb_lowater(const struct htb_class *cl) { if (htb_hysteresis) return cl->cmode != HTB_CANT_SEND ? -cl->cbuffer : 0; else return 0; } -static inline long htb_hiwater(const struct htb_class *cl) +static inline s64 htb_hiwater(const struct htb_class *cl) { if (htb_hysteresis) return cl->cmode == HTB_CAN_SEND ? -cl->buffer : 0; @@ -469,9 +470,9 @@ static inline long htb_hiwater(const struct htb_class *cl) * mode transitions per time unit. The speed gain is about 1/6. */ static inline enum htb_cmode -htb_class_mode(struct htb_class *cl, long *diff) +htb_class_mode(struct htb_class *cl, s64 *diff) { - long toks; + s64 toks; if ((toks = (cl->ctokens + *diff)) < htb_lowater(cl)) { *diff = -toks; @@ -495,7 +496,7 @@ htb_class_mode(struct htb_class *cl, long *diff) * to mode other than HTB_CAN_SEND (see htb_add_to_wait_tree). */ static void -htb_change_class_mode(struct htb_sched *q, struct htb_class *cl, long *diff) +htb_change_class_mode(struct htb_sched *q, struct htb_class *cl, s64 *diff) { enum htb_cmode new_mode = htb_class_mode(cl, diff); @@ -581,26 +582,26 @@ static int htb_enqueue(struct sk_buff *skb, struct Qdisc *sch) return NET_XMIT_SUCCESS; } -static inline void htb_accnt_tokens(struct htb_class *cl, int bytes, long diff) +static inline void htb_accnt_tokens(struct htb_class *cl, int bytes, s64 diff) { - long toks = diff + cl->tokens; + s64 toks = diff + cl->tokens; if (toks > cl->buffer) toks = cl->buffer; - toks -= (long) qdisc_l2t(cl->rate, bytes); + toks -= (s64) psched_l2t_ns(&cl->rate, bytes); if (toks <= -cl->mbuffer) toks = 1 - cl->mbuffer; cl->tokens = toks; } -static inline void htb_accnt_ctokens(struct htb_class *cl, int bytes, long diff) +static inline void htb_accnt_ctokens(struct htb_class *cl, int bytes, s64 diff) { - long toks = diff + cl->ctokens; + s64 toks = diff + cl->ctokens; if (toks > cl->cbuffer) toks = cl->cbuffer; - toks -= (long) qdisc_l2t(cl->ceil, bytes); + toks -= (s64) psched_l2t_ns(&cl->ceil, bytes); if (toks <= -cl->mbuffer) toks = 1 - cl->mbuffer; @@ -623,10 +624,10 @@ static void htb_charge_class(struct htb_sched *q, struct htb_class *cl, { int bytes = qdisc_pkt_len(skb); enum htb_cmode old_mode; - long diff; + s64 diff; while (cl) { - diff = psched_tdiff_bounded(q->now, cl->t_c, cl->mbuffer); + diff = min_t(s64, q->now - cl->t_c, cl->mbuffer); if (cl->level >= level) { if (cl->level == level) cl->xstats.lends++; @@ -673,7 +674,7 @@ static psched_time_t htb_do_events(struct htb_sched *q, int level, unsigned long stop_at = start + 2; while (time_before(jiffies, stop_at)) { struct htb_class *cl; - long diff; + s64 diff; struct rb_node *p = rb_first(&q->wait_pq[level]); if (!p) @@ -684,7 +685,7 @@ static psched_time_t htb_do_events(struct htb_sched *q, int level, return cl->pq_key; htb_safe_rb_erase(p, q->wait_pq + level); - diff = psched_tdiff_bounded(q->now, cl->t_c, cl->mbuffer); + diff = min_t(s64, q->now - cl->t_c, cl->mbuffer); htb_change_class_mode(q, cl, &diff); if (cl->cmode != HTB_CAN_SEND) htb_add_to_wait_tree(q, cl, diff); @@ -871,10 +872,10 @@ ok: if (!sch->q.qlen) goto fin; - q->now = psched_get_time(); + q->now = ktime_to_ns(ktime_get()); start_at = jiffies; - next_event = q->now + 5 * PSCHED_TICKS_PER_SEC; + next_event = q->now + 5LLU * NSEC_PER_SEC; for (level = 0; level < TC_HTB_MAXDEPTH; level++) { /* common case optimization - skip event handler quickly */ @@ -884,7 +885,7 @@ ok: if (q->now >= q->near_ev_cache[level]) { event = htb_do_events(q, level, start_at); if (!event) - event = q->now + PSCHED_TICKS_PER_SEC; + event = q->now + NSEC_PER_SEC; q->near_ev_cache[level] = event; } else event = q->near_ev_cache[level]; @@ -903,10 +904,17 @@ ok: } } sch->qstats.overlimits++; - if (likely(next_event > q->now)) - qdisc_watchdog_schedule(&q->watchdog, next_event); - else + if (likely(next_event > q->now)) { + if (!test_bit(__QDISC_STATE_DEACTIVATED, + &qdisc_root_sleeping(q->watchdog.qdisc)->state)) { + ktime_t time = ns_to_ktime(next_event); + qdisc_throttled(q->watchdog.qdisc); + hrtimer_start(&q->watchdog.timer, time, + HRTIMER_MODE_ABS); + } + } else { schedule_work(&q->work); + } fin: return skb; } @@ -941,11 +949,10 @@ static void htb_reset(struct Qdisc *sch) { struct htb_sched *q = qdisc_priv(sch); struct htb_class *cl; - struct hlist_node *n; unsigned int i; for (i = 0; i < q->clhash.hashsize; i++) { - hlist_for_each_entry(cl, n, &q->clhash.hash[i], common.hnode) { + hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) { if (cl->level) memset(&cl->un.inner, 0, sizeof(cl->un.inner)); else { @@ -1082,10 +1089,10 @@ static int htb_dump_class(struct Qdisc *sch, unsigned long arg, memset(&opt, 0, sizeof(opt)); - opt.rate = cl->rate->rate; - opt.buffer = cl->buffer; - opt.ceil = cl->ceil->rate; - opt.cbuffer = cl->cbuffer; + opt.rate.rate = psched_ratecfg_getrate(&cl->rate); + opt.buffer = PSCHED_NS2TICKS(cl->buffer); + opt.ceil.rate = psched_ratecfg_getrate(&cl->ceil); + opt.cbuffer = PSCHED_NS2TICKS(cl->cbuffer); opt.quantum = cl->quantum; opt.prio = cl->prio; opt.level = cl->level; @@ -1203,9 +1210,6 @@ static void htb_destroy_class(struct Qdisc *sch, struct htb_class *cl) qdisc_destroy(cl->un.leaf.q); } gen_kill_estimator(&cl->bstats, &cl->rate_est); - qdisc_put_rtab(cl->rate); - qdisc_put_rtab(cl->ceil); - tcf_destroy_chain(&cl->filter_list); kfree(cl); } @@ -1213,7 +1217,7 @@ static void htb_destroy_class(struct Qdisc *sch, struct htb_class *cl) static void htb_destroy(struct Qdisc *sch) { struct htb_sched *q = qdisc_priv(sch); - struct hlist_node *n, *next; + struct hlist_node *next; struct htb_class *cl; unsigned int i; @@ -1227,11 +1231,11 @@ static void htb_destroy(struct Qdisc *sch) tcf_destroy_chain(&q->filter_list); for (i = 0; i < q->clhash.hashsize; i++) { - hlist_for_each_entry(cl, n, &q->clhash.hash[i], common.hnode) + hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) tcf_destroy_chain(&cl->filter_list); } for (i = 0; i < q->clhash.hashsize; i++) { - hlist_for_each_entry_safe(cl, n, next, &q->clhash.hash[i], + hlist_for_each_entry_safe(cl, next, &q->clhash.hash[i], common.hnode) htb_destroy_class(sch, cl); } @@ -1307,7 +1311,6 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, struct htb_sched *q = qdisc_priv(sch); struct htb_class *cl = (struct htb_class *)*arg, *parent; struct nlattr *opt = tca[TCA_OPTIONS]; - struct qdisc_rate_table *rtab = NULL, *ctab = NULL; struct nlattr *tb[__TCA_HTB_MAX]; struct tc_htb_opt *hopt; @@ -1326,10 +1329,7 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, parent = parentid == TC_H_ROOT ? NULL : htb_find(parentid, sch); hopt = nla_data(tb[TCA_HTB_PARMS]); - - rtab = qdisc_get_rtab(&hopt->rate, tb[TCA_HTB_RTAB]); - ctab = qdisc_get_rtab(&hopt->ceil, tb[TCA_HTB_CTAB]); - if (!rtab || !ctab) + if (!hopt->rate.rate || !hopt->ceil.rate) goto failure; if (!cl) { /* new class */ @@ -1414,8 +1414,8 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, cl->parent = parent; /* set class to be in HTB_CAN_SEND state */ - cl->tokens = hopt->buffer; - cl->ctokens = hopt->cbuffer; + cl->tokens = PSCHED_TICKS2NS(hopt->buffer); + cl->ctokens = PSCHED_TICKS2NS(hopt->cbuffer); cl->mbuffer = 60 * PSCHED_TICKS_PER_SEC; /* 1min */ cl->t_c = psched_get_time(); cl->cmode = HTB_CAN_SEND; @@ -1439,7 +1439,7 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, * is really leaf before changing cl->un.leaf ! */ if (!cl->level) { - cl->quantum = rtab->rate.rate / q->rate2quantum; + cl->quantum = hopt->rate.rate / q->rate2quantum; if (!hopt->quantum && cl->quantum < 1000) { pr_warning( "HTB: quantum of class %X is small. Consider r2q change.\n", @@ -1458,14 +1458,12 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, cl->prio = TC_HTB_NUMPRIO - 1; } - cl->buffer = hopt->buffer; - cl->cbuffer = hopt->cbuffer; - if (cl->rate) - qdisc_put_rtab(cl->rate); - cl->rate = rtab; - if (cl->ceil) - qdisc_put_rtab(cl->ceil); - cl->ceil = ctab; + psched_ratecfg_precompute(&cl->rate, hopt->rate.rate); + psched_ratecfg_precompute(&cl->ceil, hopt->ceil.rate); + + cl->buffer = PSCHED_TICKS2NS(hopt->buffer); + cl->cbuffer = PSCHED_TICKS2NS(hopt->buffer); + sch_tree_unlock(sch); qdisc_class_hash_grow(sch, &q->clhash); @@ -1474,10 +1472,6 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, return 0; failure: - if (rtab) - qdisc_put_rtab(rtab); - if (ctab) - qdisc_put_rtab(ctab); return err; } @@ -1521,14 +1515,13 @@ static void htb_walk(struct Qdisc *sch, struct qdisc_walker *arg) { struct htb_sched *q = qdisc_priv(sch); struct htb_class *cl; - struct hlist_node *n; unsigned int i; if (arg->stop) return; for (i = 0; i < q->clhash.hashsize; i++) { - hlist_for_each_entry(cl, n, &q->clhash.hash[i], common.hnode) { + hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) { if (arg->count < arg->skip) { arg->count++; continue; diff --git a/net/sched/sch_mq.c b/net/sched/sch_mq.c index 0a4b2f9a0094..5da78a19ac9a 100644 --- a/net/sched/sch_mq.c +++ b/net/sched/sch_mq.c @@ -63,6 +63,7 @@ static int mq_init(struct Qdisc *sch, struct nlattr *opt) if (qdisc == NULL) goto err; priv->qdiscs[ntx] = qdisc; + qdisc->flags |= TCQ_F_ONETXQUEUE; } sch->flags |= TCQ_F_MQROOT; @@ -150,7 +151,8 @@ static int mq_graft(struct Qdisc *sch, unsigned long cl, struct Qdisc *new, dev_deactivate(dev); *old = dev_graft_qdisc(dev_queue, new); - + if (new) + new->flags |= TCQ_F_ONETXQUEUE; if (dev->flags & IFF_UP) dev_activate(dev); return 0; diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c index d1831ca966d4..accec33c454c 100644 --- a/net/sched/sch_mqprio.c +++ b/net/sched/sch_mqprio.c @@ -132,6 +132,7 @@ static int mqprio_init(struct Qdisc *sch, struct nlattr *opt) goto err; } priv->qdiscs[i] = qdisc; + qdisc->flags |= TCQ_F_ONETXQUEUE; } /* If the mqprio options indicate that hardware should own @@ -205,6 +206,9 @@ static int mqprio_graft(struct Qdisc *sch, unsigned long cl, struct Qdisc *new, *old = dev_graft_qdisc(dev_queue, new); + if (new) + new->flags |= TCQ_F_ONETXQUEUE; + if (dev->flags & IFF_UP) dev_activate(dev); diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index 298c0ddfb57e..3d2acc7a9c80 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -438,18 +438,18 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch) if (q->rate) { struct sk_buff_head *list = &sch->q; - delay += packet_len_2_sched_time(skb->len, q); - if (!skb_queue_empty(list)) { /* - * Last packet in queue is reference point (now). - * First packet in queue is already in flight, - * calculate this time bonus and substract + * Last packet in queue is reference point (now), + * calculate this time bonus and subtract * from delay. */ - delay -= now - netem_skb_cb(skb_peek(list))->time_to_send; + delay -= netem_skb_cb(skb_peek_tail(list))->time_to_send - now; + delay = max_t(psched_tdiff_t, 0, delay); now = netem_skb_cb(skb_peek_tail(list))->time_to_send; } + + delay += packet_len_2_sched_time(skb->len, q); } cb->time_to_send = now + delay; diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c index f0dd83cff906..d51852bba01c 100644 --- a/net/sched/sch_qfq.c +++ b/net/sched/sch_qfq.c @@ -1,7 +1,8 @@ /* - * net/sched/sch_qfq.c Quick Fair Queueing Scheduler. + * net/sched/sch_qfq.c Quick Fair Queueing Plus Scheduler. * * Copyright (c) 2009 Fabio Checconi, Luigi Rizzo, and Paolo Valente. + * Copyright (c) 2012 Paolo Valente. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -19,12 +20,18 @@ #include <net/pkt_cls.h> -/* Quick Fair Queueing - =================== +/* Quick Fair Queueing Plus + ======================== Sources: - Fabio Checconi, Luigi Rizzo, and Paolo Valente: "QFQ: Efficient + [1] Paolo Valente, + "Reducing the Execution Time of Fair-Queueing Schedulers." + http://algo.ing.unimo.it/people/paolo/agg-sched/agg-sched.pdf + + Sources for QFQ: + + [2] Fabio Checconi, Luigi Rizzo, and Paolo Valente: "QFQ: Efficient Packet Scheduling with Tight Bandwidth Distribution Guarantees." See also: @@ -33,6 +40,20 @@ /* + QFQ+ divides classes into aggregates of at most MAX_AGG_CLASSES + classes. Each aggregate is timestamped with a virtual start time S + and a virtual finish time F, and scheduled according to its + timestamps. S and F are computed as a function of a system virtual + time function V. The classes within each aggregate are instead + scheduled with DRR. + + To speed up operations, QFQ+ divides also aggregates into a limited + number of groups. Which group a class belongs to depends on the + ratio between the maximum packet length for the class and the weight + of the class. Groups have their own S and F. In the end, QFQ+ + schedules groups, then aggregates within groups, then classes within + aggregates. See [1] and [2] for a full description. + Virtual time computations. S, F and V are all computed in fixed point arithmetic with @@ -76,26 +97,28 @@ #define QFQ_MAX_SLOTS 32 /* - * Shifts used for class<->group mapping. We allow class weights that are - * in the range [1, 2^MAX_WSHIFT], and we try to map each class i to the + * Shifts used for aggregate<->group mapping. We allow class weights that are + * in the range [1, 2^MAX_WSHIFT], and we try to map each aggregate i to the * group with the smallest index that can support the L_i / r_i configured - * for the class. + * for the classes in the aggregate. * * grp->index is the index of the group; and grp->slot_shift * is the shift for the corresponding (scaled) sigma_i. */ -#define QFQ_MAX_INDEX 19 -#define QFQ_MAX_WSHIFT 16 +#define QFQ_MAX_INDEX 24 +#define QFQ_MAX_WSHIFT 10 -#define QFQ_MAX_WEIGHT (1<<QFQ_MAX_WSHIFT) -#define QFQ_MAX_WSUM (2*QFQ_MAX_WEIGHT) +#define QFQ_MAX_WEIGHT (1<<QFQ_MAX_WSHIFT) /* see qfq_slot_insert */ +#define QFQ_MAX_WSUM (64*QFQ_MAX_WEIGHT) #define FRAC_BITS 30 /* fixed point arithmetic */ #define ONE_FP (1UL << FRAC_BITS) #define IWSUM (ONE_FP/QFQ_MAX_WSUM) -#define QFQ_MTU_SHIFT 11 -#define QFQ_MIN_SLOT_SHIFT (FRAC_BITS + QFQ_MTU_SHIFT - QFQ_MAX_INDEX) +#define QFQ_MTU_SHIFT 16 /* to support TSO/GSO */ +#define QFQ_MIN_LMAX 512 /* see qfq_slot_insert */ + +#define QFQ_MAX_AGG_CLASSES 8 /* max num classes per aggregate allowed */ /* * Possible group states. These values are used as indexes for the bitmaps @@ -105,6 +128,8 @@ enum qfq_state { ER, IR, EB, IB, QFQ_MAX_STATE }; struct qfq_group; +struct qfq_aggregate; + struct qfq_class { struct Qdisc_class_common common; @@ -115,7 +140,12 @@ struct qfq_class { struct gnet_stats_queue qstats; struct gnet_stats_rate_est rate_est; struct Qdisc *qdisc; + struct list_head alist; /* Link for active-classes list. */ + struct qfq_aggregate *agg; /* Parent aggregate. */ + int deficit; /* DRR deficit counter. */ +}; +struct qfq_aggregate { struct hlist_node next; /* Link for the slot list. */ u64 S, F; /* flow timestamps (exact) */ @@ -126,8 +156,18 @@ struct qfq_class { struct qfq_group *grp; /* these are copied from the flowset. */ - u32 inv_w; /* ONE_FP/weight */ - u32 lmax; /* Max packet size for this flow. */ + u32 class_weight; /* Weight of each class in this aggregate. */ + /* Max pkt size for the classes in this aggregate, DRR quantum. */ + int lmax; + + u32 inv_w; /* ONE_FP/(sum of weights of classes in aggr.). */ + u32 budgetmax; /* Max budget for this aggregate. */ + u32 initial_budget, budget; /* Initial and current budget. */ + + int num_classes; /* Number of classes in this aggr. */ + struct list_head active; /* DRR queue of active classes. */ + + struct hlist_node nonfull_next; /* See nonfull_aggs in qfq_sched. */ }; struct qfq_group { @@ -137,7 +177,7 @@ struct qfq_group { unsigned int front; /* Index of the front slot. */ unsigned long full_slots; /* non-empty slots */ - /* Array of RR lists of active classes. */ + /* Array of RR lists of active aggregates. */ struct hlist_head slots[QFQ_MAX_SLOTS]; }; @@ -145,13 +185,28 @@ struct qfq_sched { struct tcf_proto *filter_list; struct Qdisc_class_hash clhash; - u64 V; /* Precise virtual time. */ - u32 wsum; /* weight sum */ + u64 oldV, V; /* Precise virtual times. */ + struct qfq_aggregate *in_serv_agg; /* Aggregate being served. */ + u32 num_active_agg; /* Num. of active aggregates */ + u32 wsum; /* weight sum */ unsigned long bitmaps[QFQ_MAX_STATE]; /* Group bitmaps. */ struct qfq_group groups[QFQ_MAX_INDEX + 1]; /* The groups. */ + u32 min_slot_shift; /* Index of the group-0 bit in the bitmaps. */ + + u32 max_agg_classes; /* Max number of classes per aggr. */ + struct hlist_head nonfull_aggs; /* Aggs with room for more classes. */ }; +/* + * Possible reasons why the timestamps of an aggregate are updated + * enqueue: the aggregate switches from idle to active and must scheduled + * for service + * requeue: the aggregate finishes its budget, so it stops being served and + * must be rescheduled for service + */ +enum update_reason {enqueue, requeue}; + static struct qfq_class *qfq_find_class(struct Qdisc *sch, u32 classid) { struct qfq_sched *q = qdisc_priv(sch); @@ -181,18 +236,18 @@ static const struct nla_policy qfq_policy[TCA_QFQ_MAX + 1] = { * index = log_2(maxlen/weight) but we need to apply the scaling. * This is used only once at flow creation. */ -static int qfq_calc_index(u32 inv_w, unsigned int maxlen) +static int qfq_calc_index(u32 inv_w, unsigned int maxlen, u32 min_slot_shift) { u64 slot_size = (u64)maxlen * inv_w; unsigned long size_map; int index = 0; - size_map = slot_size >> QFQ_MIN_SLOT_SHIFT; + size_map = slot_size >> min_slot_shift; if (!size_map) goto out; index = __fls(size_map) + 1; /* basically a log_2 */ - index -= !(slot_size - (1ULL << (index + QFQ_MIN_SLOT_SHIFT - 1))); + index -= !(slot_size - (1ULL << (index + min_slot_shift - 1))); if (index < 0) index = 0; @@ -203,32 +258,143 @@ out: return index; } -/* Length of the next packet (0 if the queue is empty). */ -static unsigned int qdisc_peek_len(struct Qdisc *sch) +static void qfq_deactivate_agg(struct qfq_sched *, struct qfq_aggregate *); +static void qfq_activate_agg(struct qfq_sched *, struct qfq_aggregate *, + enum update_reason); + +static void qfq_init_agg(struct qfq_sched *q, struct qfq_aggregate *agg, + u32 lmax, u32 weight) { - struct sk_buff *skb; + INIT_LIST_HEAD(&agg->active); + hlist_add_head(&agg->nonfull_next, &q->nonfull_aggs); + + agg->lmax = lmax; + agg->class_weight = weight; +} + +static struct qfq_aggregate *qfq_find_agg(struct qfq_sched *q, + u32 lmax, u32 weight) +{ + struct qfq_aggregate *agg; + + hlist_for_each_entry(agg, &q->nonfull_aggs, nonfull_next) + if (agg->lmax == lmax && agg->class_weight == weight) + return agg; + + return NULL; +} + + +/* Update aggregate as a function of the new number of classes. */ +static void qfq_update_agg(struct qfq_sched *q, struct qfq_aggregate *agg, + int new_num_classes) +{ + u32 new_agg_weight; + + if (new_num_classes == q->max_agg_classes) + hlist_del_init(&agg->nonfull_next); - skb = sch->ops->peek(sch); - return skb ? qdisc_pkt_len(skb) : 0; + if (agg->num_classes > new_num_classes && + new_num_classes == q->max_agg_classes - 1) /* agg no more full */ + hlist_add_head(&agg->nonfull_next, &q->nonfull_aggs); + + /* The next assignment may let + * agg->initial_budget > agg->budgetmax + * hold, we will take it into account in charge_actual_service(). + */ + agg->budgetmax = new_num_classes * agg->lmax; + new_agg_weight = agg->class_weight * new_num_classes; + agg->inv_w = ONE_FP/new_agg_weight; + + if (agg->grp == NULL) { + int i = qfq_calc_index(agg->inv_w, agg->budgetmax, + q->min_slot_shift); + agg->grp = &q->groups[i]; + } + + q->wsum += + (int) agg->class_weight * (new_num_classes - agg->num_classes); + + agg->num_classes = new_num_classes; +} + +/* Add class to aggregate. */ +static void qfq_add_to_agg(struct qfq_sched *q, + struct qfq_aggregate *agg, + struct qfq_class *cl) +{ + cl->agg = agg; + + qfq_update_agg(q, agg, agg->num_classes+1); + if (cl->qdisc->q.qlen > 0) { /* adding an active class */ + list_add_tail(&cl->alist, &agg->active); + if (list_first_entry(&agg->active, struct qfq_class, alist) == + cl && q->in_serv_agg != agg) /* agg was inactive */ + qfq_activate_agg(q, agg, enqueue); /* schedule agg */ + } } -static void qfq_deactivate_class(struct qfq_sched *, struct qfq_class *); -static void qfq_activate_class(struct qfq_sched *q, struct qfq_class *cl, - unsigned int len); +static struct qfq_aggregate *qfq_choose_next_agg(struct qfq_sched *); + +static void qfq_destroy_agg(struct qfq_sched *q, struct qfq_aggregate *agg) +{ + if (!hlist_unhashed(&agg->nonfull_next)) + hlist_del_init(&agg->nonfull_next); + if (q->in_serv_agg == agg) + q->in_serv_agg = qfq_choose_next_agg(q); + kfree(agg); +} + +/* Deschedule class from within its parent aggregate. */ +static void qfq_deactivate_class(struct qfq_sched *q, struct qfq_class *cl) +{ + struct qfq_aggregate *agg = cl->agg; + + + list_del(&cl->alist); /* remove from RR queue of the aggregate */ + if (list_empty(&agg->active)) /* agg is now inactive */ + qfq_deactivate_agg(q, agg); +} -static void qfq_update_class_params(struct qfq_sched *q, struct qfq_class *cl, - u32 lmax, u32 inv_w, int delta_w) +/* Remove class from its parent aggregate. */ +static void qfq_rm_from_agg(struct qfq_sched *q, struct qfq_class *cl) { - int i; + struct qfq_aggregate *agg = cl->agg; - /* update qfq-specific data */ - cl->lmax = lmax; - cl->inv_w = inv_w; - i = qfq_calc_index(cl->inv_w, cl->lmax); + cl->agg = NULL; + if (agg->num_classes == 1) { /* agg being emptied, destroy it */ + qfq_destroy_agg(q, agg); + return; + } + qfq_update_agg(q, agg, agg->num_classes-1); +} - cl->grp = &q->groups[i]; +/* Deschedule class and remove it from its parent aggregate. */ +static void qfq_deact_rm_from_agg(struct qfq_sched *q, struct qfq_class *cl) +{ + if (cl->qdisc->q.qlen > 0) /* class is active */ + qfq_deactivate_class(q, cl); - q->wsum += delta_w; + qfq_rm_from_agg(q, cl); +} + +/* Move class to a new aggregate, matching the new class weight and/or lmax */ +static int qfq_change_agg(struct Qdisc *sch, struct qfq_class *cl, u32 weight, + u32 lmax) +{ + struct qfq_sched *q = qdisc_priv(sch); + struct qfq_aggregate *new_agg = qfq_find_agg(q, lmax, weight); + + if (new_agg == NULL) { /* create new aggregate */ + new_agg = kzalloc(sizeof(*new_agg), GFP_ATOMIC); + if (new_agg == NULL) + return -ENOBUFS; + qfq_init_agg(q, new_agg, lmax, weight); + } + qfq_deact_rm_from_agg(q, cl); + qfq_add_to_agg(q, new_agg, cl); + + return 0; } static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, @@ -236,9 +402,11 @@ static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, { struct qfq_sched *q = qdisc_priv(sch); struct qfq_class *cl = (struct qfq_class *)*arg; + bool existing = false; struct nlattr *tb[TCA_QFQ_MAX + 1]; + struct qfq_aggregate *new_agg = NULL; u32 weight, lmax, inv_w; - int i, err; + int err; int delta_w; if (tca[TCA_OPTIONS] == NULL) { @@ -259,27 +427,32 @@ static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, } else weight = 1; - inv_w = ONE_FP / weight; - weight = ONE_FP / inv_w; - delta_w = weight - (cl ? ONE_FP / cl->inv_w : 0); - if (q->wsum + delta_w > QFQ_MAX_WSUM) { - pr_notice("qfq: total weight out of range (%u + %u)\n", - delta_w, q->wsum); - return -EINVAL; - } - if (tb[TCA_QFQ_LMAX]) { lmax = nla_get_u32(tb[TCA_QFQ_LMAX]); - if (!lmax || lmax > (1UL << QFQ_MTU_SHIFT)) { + if (lmax < QFQ_MIN_LMAX || lmax > (1UL << QFQ_MTU_SHIFT)) { pr_notice("qfq: invalid max length %u\n", lmax); return -EINVAL; } } else - lmax = 1UL << QFQ_MTU_SHIFT; + lmax = psched_mtu(qdisc_dev(sch)); + + inv_w = ONE_FP / weight; + weight = ONE_FP / inv_w; + + if (cl != NULL && + lmax == cl->agg->lmax && + weight == cl->agg->class_weight) + return 0; /* nothing to change */ - if (cl != NULL) { - bool need_reactivation = false; + delta_w = weight - (cl ? cl->agg->class_weight : 0); + if (q->wsum + delta_w > QFQ_MAX_WSUM) { + pr_notice("qfq: total weight out of range (%d + %u)\n", + delta_w, q->wsum); + return -EINVAL; + } + + if (cl != NULL) { /* modify existing class */ if (tca[TCA_RATE]) { err = gen_replace_estimator(&cl->bstats, &cl->rate_est, qdisc_root_sleeping_lock(sch), @@ -287,41 +460,18 @@ static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, if (err) return err; } - - if (lmax == cl->lmax && inv_w == cl->inv_w) - return 0; /* nothing to update */ - - i = qfq_calc_index(inv_w, lmax); - sch_tree_lock(sch); - if (&q->groups[i] != cl->grp && cl->qdisc->q.qlen > 0) { - /* - * shift cl->F back, to not charge the - * class for the not-yet-served head - * packet - */ - cl->F = cl->S; - /* remove class from its slot in the old group */ - qfq_deactivate_class(q, cl); - need_reactivation = true; - } - - qfq_update_class_params(q, cl, lmax, inv_w, delta_w); - - if (need_reactivation) /* activate in new group */ - qfq_activate_class(q, cl, qdisc_peek_len(cl->qdisc)); - sch_tree_unlock(sch); - - return 0; + existing = true; + goto set_change_agg; } + /* create and init new class */ cl = kzalloc(sizeof(struct qfq_class), GFP_KERNEL); if (cl == NULL) return -ENOBUFS; cl->refcnt = 1; cl->common.classid = classid; - - qfq_update_class_params(q, cl, lmax, inv_w, delta_w); + cl->deficit = lmax; cl->qdisc = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, classid); @@ -332,11 +482,8 @@ static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, err = gen_new_estimator(&cl->bstats, &cl->rate_est, qdisc_root_sleeping_lock(sch), tca[TCA_RATE]); - if (err) { - qdisc_destroy(cl->qdisc); - kfree(cl); - return err; - } + if (err) + goto destroy_class; } sch_tree_lock(sch); @@ -345,19 +492,39 @@ static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, qdisc_class_hash_grow(sch, &q->clhash); +set_change_agg: + sch_tree_lock(sch); + new_agg = qfq_find_agg(q, lmax, weight); + if (new_agg == NULL) { /* create new aggregate */ + sch_tree_unlock(sch); + new_agg = kzalloc(sizeof(*new_agg), GFP_KERNEL); + if (new_agg == NULL) { + err = -ENOBUFS; + gen_kill_estimator(&cl->bstats, &cl->rate_est); + goto destroy_class; + } + sch_tree_lock(sch); + qfq_init_agg(q, new_agg, lmax, weight); + } + if (existing) + qfq_deact_rm_from_agg(q, cl); + qfq_add_to_agg(q, new_agg, cl); + sch_tree_unlock(sch); + *arg = (unsigned long)cl; return 0; + +destroy_class: + qdisc_destroy(cl->qdisc); + kfree(cl); + return err; } static void qfq_destroy_class(struct Qdisc *sch, struct qfq_class *cl) { struct qfq_sched *q = qdisc_priv(sch); - if (cl->inv_w) { - q->wsum -= ONE_FP / cl->inv_w; - cl->inv_w = 0; - } - + qfq_rm_from_agg(q, cl); gen_kill_estimator(&cl->bstats, &cl->rate_est); qdisc_destroy(cl->qdisc); kfree(cl); @@ -472,8 +639,8 @@ static int qfq_dump_class(struct Qdisc *sch, unsigned long arg, nest = nla_nest_start(skb, TCA_OPTIONS); if (nest == NULL) goto nla_put_failure; - if (nla_put_u32(skb, TCA_QFQ_WEIGHT, ONE_FP/cl->inv_w) || - nla_put_u32(skb, TCA_QFQ_LMAX, cl->lmax)) + if (nla_put_u32(skb, TCA_QFQ_WEIGHT, cl->agg->class_weight) || + nla_put_u32(skb, TCA_QFQ_LMAX, cl->agg->lmax)) goto nla_put_failure; return nla_nest_end(skb, nest); @@ -491,8 +658,8 @@ static int qfq_dump_class_stats(struct Qdisc *sch, unsigned long arg, memset(&xstats, 0, sizeof(xstats)); cl->qdisc->qstats.qlen = cl->qdisc->q.qlen; - xstats.weight = ONE_FP/cl->inv_w; - xstats.lmax = cl->lmax; + xstats.weight = cl->agg->class_weight; + xstats.lmax = cl->agg->lmax; if (gnet_stats_copy_basic(d, &cl->bstats) < 0 || gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 || @@ -506,14 +673,13 @@ static void qfq_walk(struct Qdisc *sch, struct qdisc_walker *arg) { struct qfq_sched *q = qdisc_priv(sch); struct qfq_class *cl; - struct hlist_node *n; unsigned int i; if (arg->stop) return; for (i = 0; i < q->clhash.hashsize; i++) { - hlist_for_each_entry(cl, n, &q->clhash.hash[i], common.hnode) { + hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) { if (arg->count < arg->skip) { arg->count++; continue; @@ -643,19 +809,19 @@ static void qfq_unblock_groups(struct qfq_sched *q, int index, u64 old_F) * perhaps * old_V ^= q->V; - old_V >>= QFQ_MIN_SLOT_SHIFT; + old_V >>= q->min_slot_shift; if (old_V) { ... } * */ -static void qfq_make_eligible(struct qfq_sched *q, u64 old_V) +static void qfq_make_eligible(struct qfq_sched *q) { - unsigned long vslot = q->V >> QFQ_MIN_SLOT_SHIFT; - unsigned long old_vslot = old_V >> QFQ_MIN_SLOT_SHIFT; + unsigned long vslot = q->V >> q->min_slot_shift; + unsigned long old_vslot = q->oldV >> q->min_slot_shift; if (vslot != old_vslot) { - unsigned long mask = (1UL << fls(vslot ^ old_vslot)) - 1; + unsigned long mask = (1ULL << fls(vslot ^ old_vslot)) - 1; qfq_move_groups(q, mask, IR, ER); qfq_move_groups(q, mask, IB, EB); } @@ -663,25 +829,62 @@ static void qfq_make_eligible(struct qfq_sched *q, u64 old_V) /* - * XXX we should make sure that slot becomes less than 32. - * This is guaranteed by the input values. - * roundedS is always cl->S rounded on grp->slot_shift bits. + * The index of the slot in which the aggregate is to be inserted must + * not be higher than QFQ_MAX_SLOTS-2. There is a '-2' and not a '-1' + * because the start time of the group may be moved backward by one + * slot after the aggregate has been inserted, and this would cause + * non-empty slots to be right-shifted by one position. + * + * If the weight and lmax (max_pkt_size) of the classes do not change, + * then QFQ+ does meet the above contraint according to the current + * values of its parameters. In fact, if the weight and lmax of the + * classes do not change, then, from the theory, QFQ+ guarantees that + * the slot index is never higher than + * 2 + QFQ_MAX_AGG_CLASSES * ((1<<QFQ_MTU_SHIFT)/QFQ_MIN_LMAX) * + * (QFQ_MAX_WEIGHT/QFQ_MAX_WSUM) = 2 + 8 * 128 * (1 / 64) = 18 + * + * When the weight of a class is increased or the lmax of the class is + * decreased, a new aggregate with smaller slot size than the original + * parent aggregate of the class may happen to be activated. The + * activation of this aggregate should be properly delayed to when the + * service of the class has finished in the ideal system tracked by + * QFQ+. If the activation of the aggregate is not delayed to this + * reference time instant, then this aggregate may be unjustly served + * before other aggregates waiting for service. This may cause the + * above bound to the slot index to be violated for some of these + * unlucky aggregates. + * + * Instead of delaying the activation of the new aggregate, which is + * quite complex, the following inaccurate but simple solution is used: + * if the slot index is higher than QFQ_MAX_SLOTS-2, then the + * timestamps of the aggregate are shifted backward so as to let the + * slot index become equal to QFQ_MAX_SLOTS-2. */ -static void qfq_slot_insert(struct qfq_group *grp, struct qfq_class *cl, +static void qfq_slot_insert(struct qfq_group *grp, struct qfq_aggregate *agg, u64 roundedS) { u64 slot = (roundedS - grp->S) >> grp->slot_shift; - unsigned int i = (grp->front + slot) % QFQ_MAX_SLOTS; + unsigned int i; /* slot index in the bucket list */ + + if (unlikely(slot > QFQ_MAX_SLOTS - 2)) { + u64 deltaS = roundedS - grp->S - + ((u64)(QFQ_MAX_SLOTS - 2)<<grp->slot_shift); + agg->S -= deltaS; + agg->F -= deltaS; + slot = QFQ_MAX_SLOTS - 2; + } + + i = (grp->front + slot) % QFQ_MAX_SLOTS; - hlist_add_head(&cl->next, &grp->slots[i]); + hlist_add_head(&agg->next, &grp->slots[i]); __set_bit(slot, &grp->full_slots); } /* Maybe introduce hlist_first_entry?? */ -static struct qfq_class *qfq_slot_head(struct qfq_group *grp) +static struct qfq_aggregate *qfq_slot_head(struct qfq_group *grp) { return hlist_entry(grp->slots[grp->front].first, - struct qfq_class, next); + struct qfq_aggregate, next); } /* @@ -689,20 +892,20 @@ static struct qfq_class *qfq_slot_head(struct qfq_group *grp) */ static void qfq_front_slot_remove(struct qfq_group *grp) { - struct qfq_class *cl = qfq_slot_head(grp); + struct qfq_aggregate *agg = qfq_slot_head(grp); - BUG_ON(!cl); - hlist_del(&cl->next); + BUG_ON(!agg); + hlist_del(&agg->next); if (hlist_empty(&grp->slots[grp->front])) __clear_bit(0, &grp->full_slots); } /* - * Returns the first full queue in a group. As a side effect, - * adjust the bucket list so the first non-empty bucket is at - * position 0 in full_slots. + * Returns the first aggregate in the first non-empty bucket of the + * group. As a side effect, adjusts the bucket list so the first + * non-empty bucket is at position 0 in full_slots. */ -static struct qfq_class *qfq_slot_scan(struct qfq_group *grp) +static struct qfq_aggregate *qfq_slot_scan(struct qfq_group *grp) { unsigned int i; @@ -738,7 +941,7 @@ static void qfq_slot_rotate(struct qfq_group *grp, u64 roundedS) grp->front = (grp->front - i) % QFQ_MAX_SLOTS; } -static void qfq_update_eligible(struct qfq_sched *q, u64 old_V) +static void qfq_update_eligible(struct qfq_sched *q) { struct qfq_group *grp; unsigned long ineligible; @@ -750,137 +953,246 @@ static void qfq_update_eligible(struct qfq_sched *q, u64 old_V) if (qfq_gt(grp->S, q->V)) q->V = grp->S; } - qfq_make_eligible(q, old_V); + qfq_make_eligible(q); } } -/* - * Updates the class, returns true if also the group needs to be updated. - */ -static bool qfq_update_class(struct qfq_group *grp, struct qfq_class *cl) +/* Dequeue head packet of the head class in the DRR queue of the aggregate. */ +static void agg_dequeue(struct qfq_aggregate *agg, + struct qfq_class *cl, unsigned int len) { - unsigned int len = qdisc_peek_len(cl->qdisc); - - cl->S = cl->F; - if (!len) - qfq_front_slot_remove(grp); /* queue is empty */ - else { - u64 roundedS; + qdisc_dequeue_peeked(cl->qdisc); - cl->F = cl->S + (u64)len * cl->inv_w; - roundedS = qfq_round_down(cl->S, grp->slot_shift); - if (roundedS == grp->S) - return false; + cl->deficit -= (int) len; - qfq_front_slot_remove(grp); - qfq_slot_insert(grp, cl, roundedS); + if (cl->qdisc->q.qlen == 0) /* no more packets, remove from list */ + list_del(&cl->alist); + else if (cl->deficit < qdisc_pkt_len(cl->qdisc->ops->peek(cl->qdisc))) { + cl->deficit += agg->lmax; + list_move_tail(&cl->alist, &agg->active); } +} + +static inline struct sk_buff *qfq_peek_skb(struct qfq_aggregate *agg, + struct qfq_class **cl, + unsigned int *len) +{ + struct sk_buff *skb; + + *cl = list_first_entry(&agg->active, struct qfq_class, alist); + skb = (*cl)->qdisc->ops->peek((*cl)->qdisc); + if (skb == NULL) + WARN_ONCE(1, "qfq_dequeue: non-workconserving leaf\n"); + else + *len = qdisc_pkt_len(skb); + + return skb; +} - return true; +/* Update F according to the actual service received by the aggregate. */ +static inline void charge_actual_service(struct qfq_aggregate *agg) +{ + /* Compute the service received by the aggregate, taking into + * account that, after decreasing the number of classes in + * agg, it may happen that + * agg->initial_budget - agg->budget > agg->bugdetmax + */ + u32 service_received = min(agg->budgetmax, + agg->initial_budget - agg->budget); + + agg->F = agg->S + (u64)service_received * agg->inv_w; } +static inline void qfq_update_agg_ts(struct qfq_sched *q, + struct qfq_aggregate *agg, + enum update_reason reason); + +static void qfq_schedule_agg(struct qfq_sched *q, struct qfq_aggregate *agg); + static struct sk_buff *qfq_dequeue(struct Qdisc *sch) { struct qfq_sched *q = qdisc_priv(sch); - struct qfq_group *grp; + struct qfq_aggregate *in_serv_agg = q->in_serv_agg; struct qfq_class *cl; - struct sk_buff *skb; - unsigned int len; - u64 old_V; + struct sk_buff *skb = NULL; + /* next-packet len, 0 means no more active classes in in-service agg */ + unsigned int len = 0; - if (!q->bitmaps[ER]) + if (in_serv_agg == NULL) return NULL; - grp = qfq_ffs(q, q->bitmaps[ER]); + if (!list_empty(&in_serv_agg->active)) + skb = qfq_peek_skb(in_serv_agg, &cl, &len); - cl = qfq_slot_head(grp); - skb = qdisc_dequeue_peeked(cl->qdisc); - if (!skb) { - WARN_ONCE(1, "qfq_dequeue: non-workconserving leaf\n"); - return NULL; + /* + * If there are no active classes in the in-service aggregate, + * or if the aggregate has not enough budget to serve its next + * class, then choose the next aggregate to serve. + */ + if (len == 0 || in_serv_agg->budget < len) { + charge_actual_service(in_serv_agg); + + /* recharge the budget of the aggregate */ + in_serv_agg->initial_budget = in_serv_agg->budget = + in_serv_agg->budgetmax; + + if (!list_empty(&in_serv_agg->active)) { + /* + * Still active: reschedule for + * service. Possible optimization: if no other + * aggregate is active, then there is no point + * in rescheduling this aggregate, and we can + * just keep it as the in-service one. This + * should be however a corner case, and to + * handle it, we would need to maintain an + * extra num_active_aggs field. + */ + qfq_update_agg_ts(q, in_serv_agg, requeue); + qfq_schedule_agg(q, in_serv_agg); + } else if (sch->q.qlen == 0) { /* no aggregate to serve */ + q->in_serv_agg = NULL; + return NULL; + } + + /* + * If we get here, there are other aggregates queued: + * choose the new aggregate to serve. + */ + in_serv_agg = q->in_serv_agg = qfq_choose_next_agg(q); + skb = qfq_peek_skb(in_serv_agg, &cl, &len); } + if (!skb) + return NULL; sch->q.qlen--; qdisc_bstats_update(sch, skb); - old_V = q->V; - len = qdisc_pkt_len(skb); + agg_dequeue(in_serv_agg, cl, len); + /* If lmax is lowered, through qfq_change_class, for a class + * owning pending packets with larger size than the new value + * of lmax, then the following condition may hold. + */ + if (unlikely(in_serv_agg->budget < len)) + in_serv_agg->budget = 0; + else + in_serv_agg->budget -= len; + q->V += (u64)len * IWSUM; pr_debug("qfq dequeue: len %u F %lld now %lld\n", - len, (unsigned long long) cl->F, (unsigned long long) q->V); + len, (unsigned long long) in_serv_agg->F, + (unsigned long long) q->V); - if (qfq_update_class(grp, cl)) { - u64 old_F = grp->F; + return skb; +} - cl = qfq_slot_scan(grp); - if (!cl) - __clear_bit(grp->index, &q->bitmaps[ER]); - else { - u64 roundedS = qfq_round_down(cl->S, grp->slot_shift); - unsigned int s; +static struct qfq_aggregate *qfq_choose_next_agg(struct qfq_sched *q) +{ + struct qfq_group *grp; + struct qfq_aggregate *agg, *new_front_agg; + u64 old_F; - if (grp->S == roundedS) - goto skip_unblock; - grp->S = roundedS; - grp->F = roundedS + (2ULL << grp->slot_shift); - __clear_bit(grp->index, &q->bitmaps[ER]); - s = qfq_calc_state(q, grp); - __set_bit(grp->index, &q->bitmaps[s]); - } + qfq_update_eligible(q); + q->oldV = q->V; - qfq_unblock_groups(q, grp->index, old_F); + if (!q->bitmaps[ER]) + return NULL; + + grp = qfq_ffs(q, q->bitmaps[ER]); + old_F = grp->F; + + agg = qfq_slot_head(grp); + + /* agg starts to be served, remove it from schedule */ + qfq_front_slot_remove(grp); + + new_front_agg = qfq_slot_scan(grp); + + if (new_front_agg == NULL) /* group is now inactive, remove from ER */ + __clear_bit(grp->index, &q->bitmaps[ER]); + else { + u64 roundedS = qfq_round_down(new_front_agg->S, + grp->slot_shift); + unsigned int s; + + if (grp->S == roundedS) + return agg; + grp->S = roundedS; + grp->F = roundedS + (2ULL << grp->slot_shift); + __clear_bit(grp->index, &q->bitmaps[ER]); + s = qfq_calc_state(q, grp); + __set_bit(grp->index, &q->bitmaps[s]); } -skip_unblock: - qfq_update_eligible(q, old_V); + qfq_unblock_groups(q, grp->index, old_F); - return skb; + return agg; } /* - * Assign a reasonable start time for a new flow k in group i. + * Assign a reasonable start time for a new aggregate in group i. * Admissible values for \hat(F) are multiples of \sigma_i * no greater than V+\sigma_i . Larger values mean that * we had a wraparound so we consider the timestamp to be stale. * * If F is not stale and F >= V then we set S = F. * Otherwise we should assign S = V, but this may violate - * the ordering in ER. So, if we have groups in ER, set S to - * the F_j of the first group j which would be blocking us. + * the ordering in EB (see [2]). So, if we have groups in ER, + * set S to the F_j of the first group j which would be blocking us. * We are guaranteed not to move S backward because * otherwise our group i would still be blocked. */ -static void qfq_update_start(struct qfq_sched *q, struct qfq_class *cl) +static void qfq_update_start(struct qfq_sched *q, struct qfq_aggregate *agg) { unsigned long mask; u64 limit, roundedF; - int slot_shift = cl->grp->slot_shift; + int slot_shift = agg->grp->slot_shift; - roundedF = qfq_round_down(cl->F, slot_shift); + roundedF = qfq_round_down(agg->F, slot_shift); limit = qfq_round_down(q->V, slot_shift) + (1ULL << slot_shift); - if (!qfq_gt(cl->F, q->V) || qfq_gt(roundedF, limit)) { + if (!qfq_gt(agg->F, q->V) || qfq_gt(roundedF, limit)) { /* timestamp was stale */ - mask = mask_from(q->bitmaps[ER], cl->grp->index); + mask = mask_from(q->bitmaps[ER], agg->grp->index); if (mask) { struct qfq_group *next = qfq_ffs(q, mask); if (qfq_gt(roundedF, next->F)) { if (qfq_gt(limit, next->F)) - cl->S = next->F; + agg->S = next->F; else /* preserve timestamp correctness */ - cl->S = limit; + agg->S = limit; return; } } - cl->S = q->V; + agg->S = q->V; } else /* timestamp is not stale */ - cl->S = cl->F; + agg->S = agg->F; } +/* + * Update the timestamps of agg before scheduling/rescheduling it for + * service. In particular, assign to agg->F its maximum possible + * value, i.e., the virtual finish time with which the aggregate + * should be labeled if it used all its budget once in service. + */ +static inline void +qfq_update_agg_ts(struct qfq_sched *q, + struct qfq_aggregate *agg, enum update_reason reason) +{ + if (reason != requeue) + qfq_update_start(q, agg); + else /* just charge agg for the service received */ + agg->S = agg->F; + + agg->F = agg->S + (u64)agg->budgetmax * agg->inv_w; +} + +static void qfq_schedule_agg(struct qfq_sched *, struct qfq_aggregate *); + static int qfq_enqueue(struct sk_buff *skb, struct Qdisc *sch) { struct qfq_sched *q = qdisc_priv(sch); struct qfq_class *cl; + struct qfq_aggregate *agg; int err = 0; cl = qfq_classify(skb, sch, &err); @@ -892,6 +1204,15 @@ static int qfq_enqueue(struct sk_buff *skb, struct Qdisc *sch) } pr_debug("qfq_enqueue: cl = %x\n", cl->common.classid); + if (unlikely(cl->agg->lmax < qdisc_pkt_len(skb))) { + pr_debug("qfq: increasing maxpkt from %u to %u for class %u", + cl->agg->lmax, qdisc_pkt_len(skb), cl->common.classid); + err = qfq_change_agg(sch, cl, cl->agg->class_weight, + qdisc_pkt_len(skb)); + if (err) + return err; + } + err = qdisc_enqueue(skb, cl->qdisc); if (unlikely(err != NET_XMIT_SUCCESS)) { pr_debug("qfq_enqueue: enqueue failed %d\n", err); @@ -905,35 +1226,44 @@ static int qfq_enqueue(struct sk_buff *skb, struct Qdisc *sch) bstats_update(&cl->bstats, skb); ++sch->q.qlen; - /* If the new skb is not the head of queue, then done here. */ - if (cl->qdisc->q.qlen != 1) + agg = cl->agg; + /* if the queue was not empty, then done here */ + if (cl->qdisc->q.qlen != 1) { + if (unlikely(skb == cl->qdisc->ops->peek(cl->qdisc)) && + list_first_entry(&agg->active, struct qfq_class, alist) + == cl && cl->deficit < qdisc_pkt_len(skb)) + list_move_tail(&cl->alist, &agg->active); + return err; + } + + /* schedule class for service within the aggregate */ + cl->deficit = agg->lmax; + list_add_tail(&cl->alist, &agg->active); - /* If reach this point, queue q was idle */ - qfq_activate_class(q, cl, qdisc_pkt_len(skb)); + if (list_first_entry(&agg->active, struct qfq_class, alist) != cl || + q->in_serv_agg == agg) + return err; /* non-empty or in service, nothing else to do */ + + qfq_activate_agg(q, agg, enqueue); return err; } /* - * Handle class switch from idle to backlogged. + * Schedule aggregate according to its timestamps. */ -static void qfq_activate_class(struct qfq_sched *q, struct qfq_class *cl, - unsigned int pkt_len) +static void qfq_schedule_agg(struct qfq_sched *q, struct qfq_aggregate *agg) { - struct qfq_group *grp = cl->grp; + struct qfq_group *grp = agg->grp; u64 roundedS; int s; - qfq_update_start(q, cl); - - /* compute new finish time and rounded start. */ - cl->F = cl->S + (u64)pkt_len * cl->inv_w; - roundedS = qfq_round_down(cl->S, grp->slot_shift); + roundedS = qfq_round_down(agg->S, grp->slot_shift); /* - * insert cl in the correct bucket. - * If cl->S >= grp->S we don't need to adjust the + * Insert agg in the correct bucket. + * If agg->S >= grp->S we don't need to adjust the * bucket list and simply go to the insertion phase. * Otherwise grp->S is decreasing, we must make room * in the bucket list, and also recompute the group state. @@ -941,15 +1271,16 @@ static void qfq_activate_class(struct qfq_sched *q, struct qfq_class *cl, * was in ER make sure to adjust V. */ if (grp->full_slots) { - if (!qfq_gt(grp->S, cl->S)) + if (!qfq_gt(grp->S, agg->S)) goto skip_update; - /* create a slot for this cl->S */ + /* create a slot for this agg->S */ qfq_slot_rotate(grp, roundedS); /* group was surely ineligible, remove */ __clear_bit(grp->index, &q->bitmaps[IR]); __clear_bit(grp->index, &q->bitmaps[IB]); - } else if (!q->bitmaps[ER] && qfq_gt(roundedS, q->V)) + } else if (!q->bitmaps[ER] && qfq_gt(roundedS, q->V) && + q->in_serv_agg == NULL) q->V = roundedS; grp->S = roundedS; @@ -959,46 +1290,68 @@ static void qfq_activate_class(struct qfq_sched *q, struct qfq_class *cl, pr_debug("qfq enqueue: new state %d %#lx S %lld F %lld V %lld\n", s, q->bitmaps[s], - (unsigned long long) cl->S, - (unsigned long long) cl->F, + (unsigned long long) agg->S, + (unsigned long long) agg->F, (unsigned long long) q->V); skip_update: - qfq_slot_insert(grp, cl, roundedS); + qfq_slot_insert(grp, agg, roundedS); } +/* Update agg ts and schedule agg for service */ +static void qfq_activate_agg(struct qfq_sched *q, struct qfq_aggregate *agg, + enum update_reason reason) +{ + agg->initial_budget = agg->budget = agg->budgetmax; /* recharge budg. */ + + qfq_update_agg_ts(q, agg, reason); + if (q->in_serv_agg == NULL) { /* no aggr. in service or scheduled */ + q->in_serv_agg = agg; /* start serving this aggregate */ + /* update V: to be in service, agg must be eligible */ + q->oldV = q->V = agg->S; + } else if (agg != q->in_serv_agg) + qfq_schedule_agg(q, agg); +} + static void qfq_slot_remove(struct qfq_sched *q, struct qfq_group *grp, - struct qfq_class *cl) + struct qfq_aggregate *agg) { unsigned int i, offset; u64 roundedS; - roundedS = qfq_round_down(cl->S, grp->slot_shift); + roundedS = qfq_round_down(agg->S, grp->slot_shift); offset = (roundedS - grp->S) >> grp->slot_shift; + i = (grp->front + offset) % QFQ_MAX_SLOTS; - hlist_del(&cl->next); + hlist_del(&agg->next); if (hlist_empty(&grp->slots[i])) __clear_bit(offset, &grp->full_slots); } /* - * called to forcibly destroy a queue. - * If the queue is not in the front bucket, or if it has - * other queues in the front bucket, we can simply remove - * the queue with no other side effects. + * Called to forcibly deschedule an aggregate. If the aggregate is + * not in the front bucket, or if the latter has other aggregates in + * the front bucket, we can simply remove the aggregate with no other + * side effects. * Otherwise we must propagate the event up. */ -static void qfq_deactivate_class(struct qfq_sched *q, struct qfq_class *cl) +static void qfq_deactivate_agg(struct qfq_sched *q, struct qfq_aggregate *agg) { - struct qfq_group *grp = cl->grp; + struct qfq_group *grp = agg->grp; unsigned long mask; u64 roundedS; int s; - cl->F = cl->S; - qfq_slot_remove(q, grp, cl); + if (agg == q->in_serv_agg) { + charge_actual_service(agg); + q->in_serv_agg = qfq_choose_next_agg(q); + return; + } + + agg->F = agg->S; + qfq_slot_remove(q, grp, agg); if (!grp->full_slots) { __clear_bit(grp->index, &q->bitmaps[IR]); @@ -1017,8 +1370,8 @@ static void qfq_deactivate_class(struct qfq_sched *q, struct qfq_class *cl) } __clear_bit(grp->index, &q->bitmaps[ER]); } else if (hlist_empty(&grp->slots[grp->front])) { - cl = qfq_slot_scan(grp); - roundedS = qfq_round_down(cl->S, grp->slot_shift); + agg = qfq_slot_scan(grp); + roundedS = qfq_round_down(agg->S, grp->slot_shift); if (grp->S != roundedS) { __clear_bit(grp->index, &q->bitmaps[ER]); __clear_bit(grp->index, &q->bitmaps[IR]); @@ -1030,8 +1383,6 @@ static void qfq_deactivate_class(struct qfq_sched *q, struct qfq_class *cl) __set_bit(grp->index, &q->bitmaps[s]); } } - - qfq_update_eligible(q, q->V); } static void qfq_qlen_notify(struct Qdisc *sch, unsigned long arg) @@ -1043,6 +1394,31 @@ static void qfq_qlen_notify(struct Qdisc *sch, unsigned long arg) qfq_deactivate_class(q, cl); } +static unsigned int qfq_drop_from_slot(struct qfq_sched *q, + struct hlist_head *slot) +{ + struct qfq_aggregate *agg; + struct qfq_class *cl; + unsigned int len; + + hlist_for_each_entry(agg, slot, next) { + list_for_each_entry(cl, &agg->active, alist) { + + if (!cl->qdisc->ops->drop) + continue; + + len = cl->qdisc->ops->drop(cl->qdisc); + if (len > 0) { + if (cl->qdisc->q.qlen == 0) + qfq_deactivate_class(q, cl); + + return len; + } + } + } + return 0; +} + static unsigned int qfq_drop(struct Qdisc *sch) { struct qfq_sched *q = qdisc_priv(sch); @@ -1052,24 +1428,13 @@ static unsigned int qfq_drop(struct Qdisc *sch) for (i = 0; i <= QFQ_MAX_INDEX; i++) { grp = &q->groups[i]; for (j = 0; j < QFQ_MAX_SLOTS; j++) { - struct qfq_class *cl; - struct hlist_node *n; - - hlist_for_each_entry(cl, n, &grp->slots[j], next) { - - if (!cl->qdisc->ops->drop) - continue; - - len = cl->qdisc->ops->drop(cl->qdisc); - if (len > 0) { - sch->q.qlen--; - if (!cl->qdisc->q.qlen) - qfq_deactivate_class(q, cl); - - return len; - } + len = qfq_drop_from_slot(q, &grp->slots[j]); + if (len > 0) { + sch->q.qlen--; + return len; } } + } return 0; @@ -1080,44 +1445,50 @@ static int qfq_init_qdisc(struct Qdisc *sch, struct nlattr *opt) struct qfq_sched *q = qdisc_priv(sch); struct qfq_group *grp; int i, j, err; + u32 max_cl_shift, maxbudg_shift, max_classes; err = qdisc_class_hash_init(&q->clhash); if (err < 0) return err; + if (qdisc_dev(sch)->tx_queue_len + 1 > QFQ_MAX_AGG_CLASSES) + max_classes = QFQ_MAX_AGG_CLASSES; + else + max_classes = qdisc_dev(sch)->tx_queue_len + 1; + /* max_cl_shift = floor(log_2(max_classes)) */ + max_cl_shift = __fls(max_classes); + q->max_agg_classes = 1<<max_cl_shift; + + /* maxbudg_shift = log2(max_len * max_classes_per_agg) */ + maxbudg_shift = QFQ_MTU_SHIFT + max_cl_shift; + q->min_slot_shift = FRAC_BITS + maxbudg_shift - QFQ_MAX_INDEX; + for (i = 0; i <= QFQ_MAX_INDEX; i++) { grp = &q->groups[i]; grp->index = i; - grp->slot_shift = QFQ_MTU_SHIFT + FRAC_BITS - - (QFQ_MAX_INDEX - i); + grp->slot_shift = q->min_slot_shift + i; for (j = 0; j < QFQ_MAX_SLOTS; j++) INIT_HLIST_HEAD(&grp->slots[j]); } + INIT_HLIST_HEAD(&q->nonfull_aggs); + return 0; } static void qfq_reset_qdisc(struct Qdisc *sch) { struct qfq_sched *q = qdisc_priv(sch); - struct qfq_group *grp; struct qfq_class *cl; - struct hlist_node *n, *tmp; - unsigned int i, j; + unsigned int i; - for (i = 0; i <= QFQ_MAX_INDEX; i++) { - grp = &q->groups[i]; - for (j = 0; j < QFQ_MAX_SLOTS; j++) { - hlist_for_each_entry_safe(cl, n, tmp, - &grp->slots[j], next) { + for (i = 0; i < q->clhash.hashsize; i++) { + hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) { + if (cl->qdisc->q.qlen > 0) qfq_deactivate_class(q, cl); - } - } - } - for (i = 0; i < q->clhash.hashsize; i++) { - hlist_for_each_entry(cl, n, &q->clhash.hash[i], common.hnode) qdisc_reset(cl->qdisc); + } } sch->q.qlen = 0; } @@ -1126,13 +1497,13 @@ static void qfq_destroy_qdisc(struct Qdisc *sch) { struct qfq_sched *q = qdisc_priv(sch); struct qfq_class *cl; - struct hlist_node *n, *next; + struct hlist_node *next; unsigned int i; tcf_destroy_chain(&q->filter_list); for (i = 0; i < q->clhash.hashsize; i++) { - hlist_for_each_entry_safe(cl, n, next, &q->clhash.hash[i], + hlist_for_each_entry_safe(cl, next, &q->clhash.hash[i], common.hnode) { qfq_destroy_class(sch, cl); } diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c index 4b056c15e90c..c8388f3c3426 100644 --- a/net/sched/sch_tbf.c +++ b/net/sched/sch_tbf.c @@ -19,6 +19,7 @@ #include <linux/errno.h> #include <linux/skbuff.h> #include <net/netlink.h> +#include <net/sch_generic.h> #include <net/pkt_sched.h> @@ -100,23 +101,21 @@ struct tbf_sched_data { /* Parameters */ u32 limit; /* Maximal length of backlog: bytes */ - u32 buffer; /* Token bucket depth/rate: MUST BE >= MTU/B */ - u32 mtu; + s64 buffer; /* Token bucket depth/rate: MUST BE >= MTU/B */ + s64 mtu; u32 max_size; - struct qdisc_rate_table *R_tab; - struct qdisc_rate_table *P_tab; + struct psched_ratecfg rate; + struct psched_ratecfg peak; + bool peak_present; /* Variables */ - long tokens; /* Current number of B tokens */ - long ptokens; /* Current number of P tokens */ - psched_time_t t_c; /* Time check-point */ + s64 tokens; /* Current number of B tokens */ + s64 ptokens; /* Current number of P tokens */ + s64 t_c; /* Time check-point */ struct Qdisc *qdisc; /* Inner qdisc, default - bfifo queue */ struct qdisc_watchdog watchdog; /* Watchdog timer */ }; -#define L2T(q, L) qdisc_l2t((q)->R_tab, L) -#define L2T_P(q, L) qdisc_l2t((q)->P_tab, L) - static int tbf_enqueue(struct sk_buff *skb, struct Qdisc *sch) { struct tbf_sched_data *q = qdisc_priv(sch); @@ -156,24 +155,24 @@ static struct sk_buff *tbf_dequeue(struct Qdisc *sch) skb = q->qdisc->ops->peek(q->qdisc); if (skb) { - psched_time_t now; - long toks; - long ptoks = 0; + s64 now; + s64 toks; + s64 ptoks = 0; unsigned int len = qdisc_pkt_len(skb); - now = psched_get_time(); - toks = psched_tdiff_bounded(now, q->t_c, q->buffer); + now = ktime_to_ns(ktime_get()); + toks = min_t(s64, now - q->t_c, q->buffer); - if (q->P_tab) { + if (q->peak_present) { ptoks = toks + q->ptokens; - if (ptoks > (long)q->mtu) + if (ptoks > q->mtu) ptoks = q->mtu; - ptoks -= L2T_P(q, len); + ptoks -= (s64) psched_l2t_ns(&q->peak, len); } toks += q->tokens; - if (toks > (long)q->buffer) + if (toks > q->buffer) toks = q->buffer; - toks -= L2T(q, len); + toks -= (s64) psched_l2t_ns(&q->rate, len); if ((toks|ptoks) >= 0) { skb = qdisc_dequeue_peeked(q->qdisc); @@ -189,8 +188,8 @@ static struct sk_buff *tbf_dequeue(struct Qdisc *sch) return skb; } - qdisc_watchdog_schedule(&q->watchdog, - now + max_t(long, -toks, -ptoks)); + qdisc_watchdog_schedule_ns(&q->watchdog, + now + max_t(long, -toks, -ptoks)); /* Maybe we have a shorter packet in the queue, which can be sent now. It sounds cool, @@ -214,7 +213,7 @@ static void tbf_reset(struct Qdisc *sch) qdisc_reset(q->qdisc); sch->q.qlen = 0; - q->t_c = psched_get_time(); + q->t_c = ktime_to_ns(ktime_get()); q->tokens = q->buffer; q->ptokens = q->mtu; qdisc_watchdog_cancel(&q->watchdog); @@ -293,14 +292,19 @@ static int tbf_change(struct Qdisc *sch, struct nlattr *opt) q->qdisc = child; } q->limit = qopt->limit; - q->mtu = qopt->mtu; + q->mtu = PSCHED_TICKS2NS(qopt->mtu); q->max_size = max_size; - q->buffer = qopt->buffer; + q->buffer = PSCHED_TICKS2NS(qopt->buffer); q->tokens = q->buffer; q->ptokens = q->mtu; - swap(q->R_tab, rtab); - swap(q->P_tab, ptab); + psched_ratecfg_precompute(&q->rate, rtab->rate.rate); + if (ptab) { + psched_ratecfg_precompute(&q->peak, ptab->rate.rate); + q->peak_present = true; + } else { + q->peak_present = false; + } sch_tree_unlock(sch); err = 0; @@ -319,7 +323,7 @@ static int tbf_init(struct Qdisc *sch, struct nlattr *opt) if (opt == NULL) return -EINVAL; - q->t_c = psched_get_time(); + q->t_c = ktime_to_ns(ktime_get()); qdisc_watchdog_init(&q->watchdog, sch); q->qdisc = &noop_qdisc; @@ -331,12 +335,6 @@ static void tbf_destroy(struct Qdisc *sch) struct tbf_sched_data *q = qdisc_priv(sch); qdisc_watchdog_cancel(&q->watchdog); - - if (q->P_tab) - qdisc_put_rtab(q->P_tab); - if (q->R_tab) - qdisc_put_rtab(q->R_tab); - qdisc_destroy(q->qdisc); } @@ -352,13 +350,13 @@ static int tbf_dump(struct Qdisc *sch, struct sk_buff *skb) goto nla_put_failure; opt.limit = q->limit; - opt.rate = q->R_tab->rate; - if (q->P_tab) - opt.peakrate = q->P_tab->rate; + opt.rate.rate = psched_ratecfg_getrate(&q->rate); + if (q->peak_present) + opt.peakrate.rate = psched_ratecfg_getrate(&q->peak); else memset(&opt.peakrate, 0, sizeof(opt.peakrate)); - opt.mtu = q->mtu; - opt.buffer = q->buffer; + opt.mtu = PSCHED_NS2TICKS(q->mtu); + opt.buffer = PSCHED_NS2TICKS(q->buffer); if (nla_put(skb, TCA_TBF_PARMS, sizeof(opt), &opt)) goto nla_put_failure; |