#include <linux/crush/hash.h>
#include <linux/crush/mapper.h>
-char *ceph_osdmap_state_str(char *str, int len, int state)
+char *ceph_osdmap_state_str(char *str, int len, u32 state)
{
if (!len)
return str;
return -EINVAL;
}
-static int skip_name_map(void **p, void *end)
+static struct crush_choose_arg_map *alloc_choose_arg_map(void)
{
- int len;
- ceph_decode_32_safe(p, end, len ,bad);
- while (len--) {
- int strlen;
- *p += sizeof(u32);
- ceph_decode_32_safe(p, end, strlen, bad);
- *p += strlen;
+ struct crush_choose_arg_map *arg_map;
+
+ arg_map = kzalloc(sizeof(*arg_map), GFP_NOIO);
+ if (!arg_map)
+ return NULL;
+
+ RB_CLEAR_NODE(&arg_map->node);
+ return arg_map;
}
- return 0;
-bad:
- return -EINVAL;
+
+static void free_choose_arg_map(struct crush_choose_arg_map *arg_map)
+{
+ if (arg_map) {
+ int i, j;
+
+ WARN_ON(!RB_EMPTY_NODE(&arg_map->node));
+
+ for (i = 0; i < arg_map->size; i++) {
+ struct crush_choose_arg *arg = &arg_map->args[i];
+
+ for (j = 0; j < arg->weight_set_size; j++)
+ kfree(arg->weight_set[j].weights);
+ kfree(arg->weight_set);
+ kfree(arg->ids);
+ }
+ kfree(arg_map->args);
+ kfree(arg_map);
+ }
+}
+
+DEFINE_RB_FUNCS(choose_arg_map, struct crush_choose_arg_map, choose_args_index,
+ node);
+
+void clear_choose_args(struct crush_map *c)
+{
+ while (!RB_EMPTY_ROOT(&c->choose_args)) {
+ struct crush_choose_arg_map *arg_map =
+ rb_entry(rb_first(&c->choose_args),
+ struct crush_choose_arg_map, node);
+
+ erase_choose_arg_map(&c->choose_args, arg_map);
+ free_choose_arg_map(arg_map);
+ }
+}
+
+static u32 *decode_array_32_alloc(void **p, void *end, u32 *plen)
+{
+ u32 *a = NULL;
+ u32 len;
+ int ret;
+
+ ceph_decode_32_safe(p, end, len, e_inval);
+ if (len) {
+ u32 i;
+
+ a = kmalloc_array(len, sizeof(u32), GFP_NOIO);
+ if (!a) {
+ ret = -ENOMEM;
+ goto fail;
+ }
+
+ ceph_decode_need(p, end, len * sizeof(u32), e_inval);
+ for (i = 0; i < len; i++)
+ a[i] = ceph_decode_32(p);
+ }
+
+ *plen = len;
+ return a;
+
+e_inval:
+ ret = -EINVAL;
+fail:
+ kfree(a);
+ return ERR_PTR(ret);
+}
+
+/*
+ * Assumes @arg is zero-initialized.
+ */
+static int decode_choose_arg(void **p, void *end, struct crush_choose_arg *arg)
+{
+ int ret;
+
+ ceph_decode_32_safe(p, end, arg->weight_set_size, e_inval);
+ if (arg->weight_set_size) {
+ u32 i;
+
+ arg->weight_set = kmalloc_array(arg->weight_set_size,
+ sizeof(*arg->weight_set),
+ GFP_NOIO);
+ if (!arg->weight_set)
+ return -ENOMEM;
+
+ for (i = 0; i < arg->weight_set_size; i++) {
+ struct crush_weight_set *w = &arg->weight_set[i];
+
+ w->weights = decode_array_32_alloc(p, end, &w->size);
+ if (IS_ERR(w->weights)) {
+ ret = PTR_ERR(w->weights);
+ w->weights = NULL;
+ return ret;
+ }
+ }
+ }
+
+ arg->ids = decode_array_32_alloc(p, end, &arg->ids_size);
+ if (IS_ERR(arg->ids)) {
+ ret = PTR_ERR(arg->ids);
+ arg->ids = NULL;
+ return ret;
+ }
+
+ return 0;
+
+e_inval:
+ return -EINVAL;
+}
+
+static int decode_choose_args(void **p, void *end, struct crush_map *c)
+{
+ struct crush_choose_arg_map *arg_map = NULL;
+ u32 num_choose_arg_maps, num_buckets;
+ int ret;
+
+ ceph_decode_32_safe(p, end, num_choose_arg_maps, e_inval);
+ while (num_choose_arg_maps--) {
+ arg_map = alloc_choose_arg_map();
+ if (!arg_map) {
+ ret = -ENOMEM;
+ goto fail;
+ }
+
+ ceph_decode_64_safe(p, end, arg_map->choose_args_index,
+ e_inval);
+ arg_map->size = c->max_buckets;
+ arg_map->args = kcalloc(arg_map->size, sizeof(*arg_map->args),
+ GFP_NOIO);
+ if (!arg_map->args) {
+ ret = -ENOMEM;
+ goto fail;
+ }
+
+ ceph_decode_32_safe(p, end, num_buckets, e_inval);
+ while (num_buckets--) {
+ struct crush_choose_arg *arg;
+ u32 bucket_index;
+
+ ceph_decode_32_safe(p, end, bucket_index, e_inval);
+ if (bucket_index >= arg_map->size)
+ goto e_inval;
+
+ arg = &arg_map->args[bucket_index];
+ ret = decode_choose_arg(p, end, arg);
+ if (ret)
+ goto fail;
+ }
+
+ insert_choose_arg_map(&c->choose_args, arg_map);
+ }
+
+ return 0;
+
+e_inval:
+ ret = -EINVAL;
+fail:
+ free_choose_arg_map(arg_map);
+ return ret;
}
static void crush_finalize(struct crush_map *c)
void **p = &pbyval;
void *start = pbyval;
u32 magic;
- u32 num_name_maps;
dout("crush_decode %p to %p len %d\n", *p, end, (int)(end - *p));
if (c == NULL)
return ERR_PTR(-ENOMEM);
+ c->choose_args = RB_ROOT;
+
/* set tunables to default values */
c->choose_local_tries = 2;
c->choose_local_fallback_tries = 5;
}
}
- /* ignore trailing name maps. */
- for (num_name_maps = 0; num_name_maps < 3; num_name_maps++) {
- err = skip_name_map(p, end);
- if (err < 0)
- goto done;
- }
+ ceph_decode_skip_map(p, end, 32, string, bad); /* type_map */
+ ceph_decode_skip_map(p, end, 32, string, bad); /* name_map */
+ ceph_decode_skip_map(p, end, 32, string, bad); /* rule_name_map */
/* tunables */
ceph_decode_need(p, end, 3*sizeof(u32), done);
dout("crush decode tunable chooseleaf_stable = %d\n",
c->chooseleaf_stable);
+ if (*p != end) {
+ /* class_map */
+ ceph_decode_skip_map(p, end, 32, 32, bad);
+ /* class_name */
+ ceph_decode_skip_map(p, end, 32, string, bad);
+ /* class_bucket */
+ ceph_decode_skip_map_of_map(p, end, 32, 32, 32, bad);
+ }
+
+ if (*p != end) {
+ err = decode_choose_args(p, end, c);
+ if (err)
+ goto bad;
+ }
+
done:
crush_finalize(c);
dout("crush_decode success\n");
map->pool_max = -1;
map->pg_temp = RB_ROOT;
map->primary_temp = RB_ROOT;
+ map->pg_upmap = RB_ROOT;
+ map->pg_upmap_items = RB_ROOT;
mutex_init(&map->crush_workspace_mutex);
return map;
erase_pg_mapping(&map->primary_temp, pg);
free_pg_mapping(pg);
}
+ while (!RB_EMPTY_ROOT(&map->pg_upmap)) {
+ struct ceph_pg_mapping *pg =
+ rb_entry(rb_first(&map->pg_upmap),
+ struct ceph_pg_mapping, node);
+ rb_erase(&pg->node, &map->pg_upmap);
+ kfree(pg);
+ }
+ while (!RB_EMPTY_ROOT(&map->pg_upmap_items)) {
+ struct ceph_pg_mapping *pg =
+ rb_entry(rb_first(&map->pg_upmap_items),
+ struct ceph_pg_mapping, node);
+ rb_erase(&pg->node, &map->pg_upmap_items);
+ kfree(pg);
+ }
while (!RB_EMPTY_ROOT(&map->pg_pools)) {
struct ceph_pg_pool_info *pi =
rb_entry(rb_first(&map->pg_pools),
*/
static int osdmap_set_max_osd(struct ceph_osdmap *map, int max)
{
- u8 *state;
+ u32 *state;
u32 *weight;
struct ceph_entity_addr *addr;
int i;
return -EINVAL;
}
+static struct ceph_pg_mapping *__decode_pg_upmap(void **p, void *end,
+ bool __unused)
+{
+ return __decode_pg_temp(p, end, false);
+}
+
+static int decode_pg_upmap(void **p, void *end, struct ceph_osdmap *map)
+{
+ return decode_pg_mapping(p, end, &map->pg_upmap, __decode_pg_upmap,
+ false);
+}
+
+static int decode_new_pg_upmap(void **p, void *end, struct ceph_osdmap *map)
+{
+ return decode_pg_mapping(p, end, &map->pg_upmap, __decode_pg_upmap,
+ true);
+}
+
+static int decode_old_pg_upmap(void **p, void *end, struct ceph_osdmap *map)
+{
+ return decode_pg_mapping(p, end, &map->pg_upmap, NULL, true);
+}
+
+static struct ceph_pg_mapping *__decode_pg_upmap_items(void **p, void *end,
+ bool __unused)
+{
+ struct ceph_pg_mapping *pg;
+ u32 len, i;
+
+ ceph_decode_32_safe(p, end, len, e_inval);
+ if (len > (SIZE_MAX - sizeof(*pg)) / (2 * sizeof(u32)))
+ return ERR_PTR(-EINVAL);
+
+ ceph_decode_need(p, end, 2 * len * sizeof(u32), e_inval);
+ pg = kzalloc(sizeof(*pg) + 2 * len * sizeof(u32), GFP_NOIO);
+ if (!pg)
+ return ERR_PTR(-ENOMEM);
+
+ pg->pg_upmap_items.len = len;
+ for (i = 0; i < len; i++) {
+ pg->pg_upmap_items.from_to[i][0] = ceph_decode_32(p);
+ pg->pg_upmap_items.from_to[i][1] = ceph_decode_32(p);
+ }
+
+ return pg;
+
+e_inval:
+ return ERR_PTR(-EINVAL);
+}
+
+static int decode_pg_upmap_items(void **p, void *end, struct ceph_osdmap *map)
+{
+ return decode_pg_mapping(p, end, &map->pg_upmap_items,
+ __decode_pg_upmap_items, false);
+}
+
+static int decode_new_pg_upmap_items(void **p, void *end,
+ struct ceph_osdmap *map)
+{
+ return decode_pg_mapping(p, end, &map->pg_upmap_items,
+ __decode_pg_upmap_items, true);
+}
+
+static int decode_old_pg_upmap_items(void **p, void *end,
+ struct ceph_osdmap *map)
+{
+ return decode_pg_mapping(p, end, &map->pg_upmap_items, NULL, true);
+}
+
/*
* decode a full map.
*/
/* osd_state, osd_weight, osd_addrs->client_addr */
ceph_decode_need(p, end, 3*sizeof(u32) +
- map->max_osd*(1 + sizeof(*map->osd_weight) +
+ map->max_osd*((struct_v >= 5 ? sizeof(u32) :
+ sizeof(u8)) +
+ sizeof(*map->osd_weight) +
sizeof(*map->osd_addr)), e_inval);
if (ceph_decode_32(p) != map->max_osd)
goto e_inval;
- ceph_decode_copy(p, map->osd_state, map->max_osd);
+ if (struct_v >= 5) {
+ for (i = 0; i < map->max_osd; i++)
+ map->osd_state[i] = ceph_decode_32(p);
+ } else {
+ for (i = 0; i < map->max_osd; i++)
+ map->osd_state[i] = ceph_decode_8(p);
+ }
if (ceph_decode_32(p) != map->max_osd)
goto e_inval;
if (err)
goto bad;
} else {
- /* XXX can this happen? */
- kfree(map->osd_primary_affinity);
- map->osd_primary_affinity = NULL;
+ WARN_ON(map->osd_primary_affinity);
}
/* crush */
if (err)
goto bad;
+ *p += len;
+ if (struct_v >= 3) {
+ /* erasure_code_profiles */
+ ceph_decode_skip_map_of_map(p, end, string, string, string,
+ bad);
+ }
+
+ if (struct_v >= 4) {
+ err = decode_pg_upmap(p, end, map);
+ if (err)
+ goto bad;
+
+ err = decode_pg_upmap_items(p, end, map);
+ if (err)
+ goto bad;
+ } else {
+ WARN_ON(!RB_EMPTY_ROOT(&map->pg_upmap));
+ WARN_ON(!RB_EMPTY_ROOT(&map->pg_upmap_items));
+ }
+
/* ignore the rest */
*p = end;
* new_up_client: { osd=6, addr=... } # set osd_state and addr
* new_state: { osd=6, xorstate=EXISTS } # clear osd_state
*/
-static int decode_new_up_state_weight(void **p, void *end,
+static int decode_new_up_state_weight(void **p, void *end, u8 struct_v,
struct ceph_osdmap *map)
{
void *new_up_client;
new_state = *p;
ceph_decode_32_safe(p, end, len, e_inval);
- len *= sizeof(u32) + sizeof(u8);
+ len *= sizeof(u32) + (struct_v >= 5 ? sizeof(u32) : sizeof(u8));
ceph_decode_need(p, end, len, e_inval);
*p += len;
len = ceph_decode_32(p);
while (len--) {
s32 osd;
- u8 xorstate;
+ u32 xorstate;
int ret;
osd = ceph_decode_32(p);
- xorstate = ceph_decode_8(p);
+ if (struct_v >= 5)
+ xorstate = ceph_decode_32(p);
+ else
+ xorstate = ceph_decode_8(p);
if (xorstate == 0)
xorstate = CEPH_OSD_UP;
BUG_ON(osd >= map->max_osd);
}
/* new_up_client, new_state, new_weight */
- err = decode_new_up_state_weight(p, end, map);
+ err = decode_new_up_state_weight(p, end, struct_v, map);
if (err)
goto bad;
goto bad;
}
+ if (struct_v >= 3) {
+ /* new_erasure_code_profiles */
+ ceph_decode_skip_map_of_map(p, end, string, string, string,
+ bad);
+ /* old_erasure_code_profiles */
+ ceph_decode_skip_set(p, end, string, bad);
+ }
+
+ if (struct_v >= 4) {
+ err = decode_new_pg_upmap(p, end, map);
+ if (err)
+ goto bad;
+
+ err = decode_old_pg_upmap(p, end, map);
+ if (err)
+ goto bad;
+
+ err = decode_new_pg_upmap_items(p, end, map);
+ if (err)
+ goto bad;
+
+ err = decode_old_pg_upmap_items(p, end, map);
+ if (err)
+ goto bad;
+ }
+
/* ignore the rest */
*p = end;
static int do_crush(struct ceph_osdmap *map, int ruleno, int x,
int *result, int result_max,
- const __u32 *weight, int weight_max)
+ const __u32 *weight, int weight_max,
+ u64 choose_args_index)
{
+ struct crush_choose_arg_map *arg_map;
int r;
BUG_ON(result_max > CEPH_PG_MAX_SIZE);
+ arg_map = lookup_choose_arg_map(&map->crush->choose_args,
+ choose_args_index);
+
mutex_lock(&map->crush_workspace_mutex);
r = crush_do_rule(map->crush, ruleno, x, result, result_max,
- weight, weight_max, map->crush_workspace);
+ weight, weight_max, map->crush_workspace,
+ arg_map ? arg_map->args : NULL);
mutex_unlock(&map->crush_workspace_mutex);
return r;
}
+static void remove_nonexistent_osds(struct ceph_osdmap *osdmap,
+ struct ceph_pg_pool_info *pi,
+ struct ceph_osds *set)
+{
+ int i;
+
+ if (ceph_can_shift_osds(pi)) {
+ int removed = 0;
+
+ /* shift left */
+ for (i = 0; i < set->size; i++) {
+ if (!ceph_osd_exists(osdmap, set->osds[i])) {
+ removed++;
+ continue;
+ }
+ if (removed)
+ set->osds[i - removed] = set->osds[i];
+ }
+ set->size -= removed;
+ } else {
+ /* set dne devices to NONE */
+ for (i = 0; i < set->size; i++) {
+ if (!ceph_osd_exists(osdmap, set->osds[i]))
+ set->osds[i] = CRUSH_ITEM_NONE;
+ }
+ }
+}
+
/*
- * Calculate raw set (CRUSH output) for given PG. The result may
- * contain nonexistent OSDs. ->primary is undefined for a raw set.
+ * Calculate raw set (CRUSH output) for given PG and filter out
+ * nonexistent OSDs. ->primary is undefined for a raw set.
*
* Placement seed (CRUSH input) is returned through @ppps.
*/
}
len = do_crush(osdmap, ruleno, pps, raw->osds, pi->size,
- osdmap->osd_weight, osdmap->max_osd);
+ osdmap->osd_weight, osdmap->max_osd, pi->id);
if (len < 0) {
pr_err("error %d from crush rule %d: pool %lld ruleset %d type %d size %d\n",
len, ruleno, pi->id, pi->crush_ruleset, pi->type,
}
raw->size = len;
+ remove_nonexistent_osds(osdmap, pi, raw);
+}
+
+/* apply pg_upmap[_items] mappings */
+static void apply_upmap(struct ceph_osdmap *osdmap,
+ const struct ceph_pg *pgid,
+ struct ceph_osds *raw)
+{
+ struct ceph_pg_mapping *pg;
+ int i, j;
+
+ pg = lookup_pg_mapping(&osdmap->pg_upmap, pgid);
+ if (pg) {
+ /* make sure targets aren't marked out */
+ for (i = 0; i < pg->pg_upmap.len; i++) {
+ int osd = pg->pg_upmap.osds[i];
+
+ if (osd != CRUSH_ITEM_NONE &&
+ osd < osdmap->max_osd &&
+ osdmap->osd_weight[osd] == 0) {
+ /* reject/ignore explicit mapping */
+ return;
+ }
+ }
+ for (i = 0; i < pg->pg_upmap.len; i++)
+ raw->osds[i] = pg->pg_upmap.osds[i];
+ raw->size = pg->pg_upmap.len;
+ return;
+ }
+
+ pg = lookup_pg_mapping(&osdmap->pg_upmap_items, pgid);
+ if (pg) {
+ /*
+ * Note: this approach does not allow a bidirectional swap,
+ * e.g., [[1,2],[2,1]] applied to [0,1,2] -> [0,2,1].
+ */
+ for (i = 0; i < pg->pg_upmap_items.len; i++) {
+ int from = pg->pg_upmap_items.from_to[i][0];
+ int to = pg->pg_upmap_items.from_to[i][1];
+ int pos = -1;
+ bool exists = false;
+
+ /* make sure replacement doesn't already appear */
+ for (j = 0; j < raw->size; j++) {
+ int osd = raw->osds[j];
+
+ if (osd == to) {
+ exists = true;
+ break;
+ }
+ /* ignore mapping if target is marked out */
+ if (osd == from && pos < 0 &&
+ !(to != CRUSH_ITEM_NONE &&
+ to < osdmap->max_osd &&
+ osdmap->osd_weight[to] == 0)) {
+ pos = j;
+ }
+ }
+ if (!exists && pos >= 0) {
+ raw->osds[pos] = to;
+ return;
+ }
+ }
+ }
}
/*
*/
static void get_temp_osds(struct ceph_osdmap *osdmap,
struct ceph_pg_pool_info *pi,
- const struct ceph_pg *raw_pgid,
+ const struct ceph_pg *pgid,
struct ceph_osds *temp)
{
- struct ceph_pg pgid;
struct ceph_pg_mapping *pg;
int i;
- raw_pg_to_pg(pi, raw_pgid, &pgid);
ceph_osds_init(temp);
/* pg_temp? */
- pg = lookup_pg_mapping(&osdmap->pg_temp, &pgid);
+ pg = lookup_pg_mapping(&osdmap->pg_temp, pgid);
if (pg) {
for (i = 0; i < pg->pg_temp.len; i++) {
if (ceph_osd_is_down(osdmap, pg->pg_temp.osds[i])) {
}
/* primary_temp? */
- pg = lookup_pg_mapping(&osdmap->primary_temp, &pgid);
+ pg = lookup_pg_mapping(&osdmap->primary_temp, pgid);
if (pg)
temp->primary = pg->primary_temp.osd;
}
struct ceph_osds *up,
struct ceph_osds *acting)
{
+ struct ceph_pg pgid;
u32 pps;
WARN_ON(pi->id != raw_pgid->pool);
+ raw_pg_to_pg(pi, raw_pgid, &pgid);
pg_to_raw_osds(osdmap, pi, raw_pgid, up, &pps);
+ apply_upmap(osdmap, &pgid, up);
raw_to_up_osds(osdmap, pi, up);
apply_primary_affinity(osdmap, pi, pps, up);
- get_temp_osds(osdmap, pi, raw_pgid, acting);
+ get_temp_osds(osdmap, pi, &pgid, acting);
if (!acting->size) {
memcpy(acting->osds, up->osds, up->size * sizeof(up->osds[0]));
acting->size = up->size;