#define SEC_XFER_SIZE 512
#define EXTRACT_SIZE 10
- #define DEBUG_RANDOM_BOOT 0
#define LONGS(x) (((x) + sizeof(unsigned long) - 1)/sizeof(unsigned long))
static void _crng_backtrack_protect(struct crng_state *crng,
__u8 tmp[CHACHA20_BLOCK_SIZE], int used);
static void process_random_ready_list(void);
+ static void _get_random_bytes(void *buf, int nbytes);
/**********************************************************************
*
_extract_entropy(&input_pool, &crng->state[4],
sizeof(__u32) * 12, 0);
else
- get_random_bytes(&crng->state[4], sizeof(__u32) * 12);
+ _get_random_bytes(&crng->state[4], sizeof(__u32) * 12);
for (i = 4; i < 16; i++) {
if (!arch_get_random_seed_long(&rv) &&
!arch_get_random_long(&rv))
}
}
- static inline void crng_wait_ready(void)
- {
- wait_event_interruptible(crng_init_wait, crng_ready());
- }
-
static void _extract_crng(struct crng_state *crng,
__u8 out[CHACHA20_BLOCK_SIZE])
{
unsigned long time = random_get_entropy() ^ jiffies;
unsigned long flags;
+ if (!crng_ready()) {
+ crng_fast_load(buf, size);
+ return;
+ }
+
trace_add_device_randomness(size, _RET_IP_);
spin_lock_irqsave(&input_pool.lock, flags);
_mix_pool_bytes(&input_pool, buf, size);
return ret;
}
+ #define warn_unseeded_randomness(previous) \
+ _warn_unseeded_randomness(__func__, (void *) _RET_IP_, (previous))
+
+ static void _warn_unseeded_randomness(const char *func_name, void *caller,
+ void **previous)
+ {
+ #ifdef CONFIG_WARN_ALL_UNSEEDED_RANDOM
+ const bool print_once = false;
+ #else
+ static bool print_once __read_mostly;
+ #endif
+
+ if (print_once ||
+ crng_ready() ||
+ (previous && (caller == READ_ONCE(*previous))))
+ return;
+ WRITE_ONCE(*previous, caller);
+ #ifndef CONFIG_WARN_ALL_UNSEEDED_RANDOM
+ print_once = true;
+ #endif
+ pr_notice("random: %s called from %pF with crng_init=%d\n",
+ func_name, caller, crng_init);
+ }
+
/*
* This function is the exported kernel interface. It returns some
* number of good random numbers, suitable for key generation, seeding
* TCP sequence numbers, etc. It does not rely on the hardware random
* number generator. For random bytes direct from the hardware RNG
- * (when available), use get_random_bytes_arch().
+ * (when available), use get_random_bytes_arch(). In order to ensure
+ * that the randomness provided by this function is okay, the function
+ * wait_for_random_bytes() should be called and return 0 at least once
+ * at any point prior.
*/
- void get_random_bytes(void *buf, int nbytes)
+ static void _get_random_bytes(void *buf, int nbytes)
{
__u8 tmp[CHACHA20_BLOCK_SIZE];
- #if DEBUG_RANDOM_BOOT > 0
- if (!crng_ready())
- printk(KERN_NOTICE "random: %pF get_random_bytes called "
- "with crng_init = %d\n", (void *) _RET_IP_, crng_init);
- #endif
trace_get_random_bytes(nbytes, _RET_IP_);
while (nbytes >= CHACHA20_BLOCK_SIZE) {
crng_backtrack_protect(tmp, CHACHA20_BLOCK_SIZE);
memzero_explicit(tmp, sizeof(tmp));
}
+
+ void get_random_bytes(void *buf, int nbytes)
+ {
+ static void *previous;
+
+ warn_unseeded_randomness(&previous);
+ _get_random_bytes(buf, nbytes);
+ }
EXPORT_SYMBOL(get_random_bytes);
+ /*
+ * Wait for the urandom pool to be seeded and thus guaranteed to supply
+ * cryptographically secure random numbers. This applies to: the /dev/urandom
+ * device, the get_random_bytes function, and the get_random_{u32,u64,int,long}
+ * family of functions. Using any of these functions without first calling
+ * this function forfeits the guarantee of security.
+ *
+ * Returns: 0 if the urandom pool has been seeded.
+ * -ERESTARTSYS if the function was interrupted by a signal.
+ */
+ int wait_for_random_bytes(void)
+ {
+ if (likely(crng_ready()))
+ return 0;
+ return wait_event_interruptible(crng_init_wait, crng_ready());
+ }
+ EXPORT_SYMBOL(wait_for_random_bytes);
+
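/*
 * Editor's illustrative sketch (not part of this patch): a hypothetical
 * caller that needs cryptographically secure key material at a point where
 * the CRNG may not yet be seeded.  It follows the contract documented above:
 * call wait_for_random_bytes() and only use get_random_bytes() once it has
 * returned 0.  The function name is made up for the example.
 */
static int example_generate_session_key(void *key, int nbytes)
{
	int ret;

	ret = wait_for_random_bytes();	/* may sleep; -ERESTARTSYS on signal */
	if (unlikely(ret))
		return ret;

	get_random_bytes(key, nbytes);	/* CRNG is now guaranteed to be seeded */
	return 0;
}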
/*
* Add a callback function that will be invoked when the nonblocking
* pool is initialised.
SYSCALL_DEFINE3(getrandom, char __user *, buf, size_t, count,
unsigned int, flags)
{
+ int ret;
+
if (flags & ~(GRND_NONBLOCK|GRND_RANDOM))
return -EINVAL;
if (!crng_ready()) {
if (flags & GRND_NONBLOCK)
return -EAGAIN;
- crng_wait_ready();
- if (signal_pending(current))
- return -ERESTARTSYS;
+ ret = wait_for_random_bytes();
+ if (unlikely(ret))
+ return ret;
}
return urandom_read(NULL, buf, count, NULL);
}
/*
* Get a random word for internal kernel use only. The quality of the random
* number is either as good as RDRAND or as good as /dev/urandom, with the
- * goal of being quite fast and not depleting entropy.
+ * goal of being quite fast and not depleting entropy. In order to ensure
+ * that the randomness provided by this function is okay, the function
+ * wait_for_random_bytes() should be called and return 0 at least once
+ * at any point prior.
*/
static DEFINE_PER_CPU(struct batched_entropy, batched_entropy_u64);
u64 get_random_u64(void)
{
u64 ret;
- bool use_lock = READ_ONCE(crng_init) < 2;
+ bool use_lock;
unsigned long flags = 0;
struct batched_entropy *batch;
+ static void *previous;
#if BITS_PER_LONG == 64
if (arch_get_random_long((unsigned long *)&ret))
return ret;
#endif
+ warn_unseeded_randomness(&previous);
+
+ use_lock = READ_ONCE(crng_init) < 2;
batch = &get_cpu_var(batched_entropy_u64);
if (use_lock)
read_lock_irqsave(&batched_entropy_reset_lock, flags);
u32 get_random_u32(void)
{
u32 ret;
- bool use_lock = READ_ONCE(crng_init) < 2;
+ bool use_lock;
unsigned long flags = 0;
struct batched_entropy *batch;
+ static void *previous;
if (arch_get_random_int(&ret))
return ret;
+ warn_unseeded_randomness(&previous);
+
+ use_lock = READ_ONCE(crng_init) < 2;
batch = &get_cpu_var(batched_entropy_u32);
if (use_lock)
read_lock_irqsave(&batched_entropy_reset_lock, flags);
return 0;
}
- static void iscsi_login_set_conn_values(
+ static int iscsi_login_set_conn_values(
struct iscsi_session *sess,
struct iscsi_conn *conn,
__be16 cid)
{
+ int ret;
conn->sess = sess;
conn->cid = be16_to_cpu(cid);
/*
* Generate a random Status sequence number (statsn) for the new
* iSCSI connection.
*/
- get_random_bytes(&conn->stat_sn, sizeof(u32));
+ ret = get_random_bytes_wait(&conn->stat_sn, sizeof(u32));
+ if (unlikely(ret))
+ return ret;
mutex_lock(&auth_id_lock);
conn->auth_id = iscsit_global->auth_id++;
mutex_unlock(&auth_id_lock);
+ return 0;
}
__printf(2, 3) int iscsi_change_param_sprintf(
return -ENOMEM;
}
- iscsi_login_set_conn_values(sess, conn, pdu->cid);
+ ret = iscsi_login_set_conn_values(sess, conn, pdu->cid);
+ if (unlikely(ret)) {
+ kfree(sess);
+ return ret;
+ }
sess->init_task_tag = pdu->itt;
memcpy(&sess->isid, pdu->isid, 6);
sess->exp_cmd_sn = be32_to_cpu(pdu->cmdsn);
{
struct iscsi_login_req *pdu = (struct iscsi_login_req *)buf;
- iscsi_login_set_conn_values(NULL, conn, pdu->cid);
- return 0;
+ return iscsi_login_set_conn_values(NULL, conn, pdu->cid);
}
/*
atomic_set(&sess->session_continuation, 1);
spin_unlock_bh(&sess->conn_lock);
- iscsi_login_set_conn_values(sess, conn, pdu->cid);
-
- if (iscsi_copy_param_list(&conn->param_list,
+ if (iscsi_login_set_conn_values(sess, conn, pdu->cid) < 0 ||
+ iscsi_copy_param_list(&conn->param_list,
conn->tpg->param_list, 0) < 0) {
iscsit_tx_login_rsp(conn, ISCSI_STATUS_CLS_TARGET_ERR,
ISCSI_LOGIN_STATUS_NO_RESOURCES);
break;
}
+ while (!kthread_should_stop()) {
+ msleep(100);
+ }
+
return 0;
}
#include <linux/key-type.h>
#include "cifs_spnego.h"
#include "fscache.h"
-#ifdef CONFIG_CIFS_SMB2
#include "smb2pdu.h"
-#endif
int cifsFYI = 0;
bool traceSMB;
cifs_inode->uniqueid = 0;
cifs_inode->createtime = 0;
cifs_inode->epoch = 0;
-#ifdef CONFIG_CIFS_SMB2
generate_random_uuid(cifs_inode->lease_key);
-#endif
+
/*
* Can not set i_flags here - they get immediately overwritten to zero
* by the VFS.
static int
cifs_init_request_bufs(void)
{
- size_t max_hdr_size = MAX_CIFS_HDR_SIZE;
-#ifdef CONFIG_CIFS_SMB2
/*
* SMB2 maximum header size is bigger than CIFS one - no problems to
* allocate some more bytes for CIFS.
*/
- max_hdr_size = MAX_SMB2_HDR_SIZE;
-#endif
+ size_t max_hdr_size = MAX_SMB2_HDR_SIZE;
+
if (CIFSMaxBufSize < 8192) {
/* Buffer size can not be smaller than 2 * PATH_MAX since maximum
Unicode path name has to fit in any SMB/CIFS path based frames */
spin_lock_init(&cifs_tcp_ses_lock);
spin_lock_init(&GlobalMid_Lock);
- get_random_bytes(&cifs_lock_secret, sizeof(cifs_lock_secret));
+ cifs_lock_secret = get_random_u32();
if (cifs_max_pending < 2) {
cifs_max_pending = 2;
MODULE_SOFTDEP("pre: md4");
MODULE_SOFTDEP("pre: md5");
MODULE_SOFTDEP("pre: nls");
-#ifdef CONFIG_CIFS_SMB2
MODULE_SOFTDEP("pre: aes");
MODULE_SOFTDEP("pre: cmac");
MODULE_SOFTDEP("pre: sha256");
MODULE_SOFTDEP("pre: aead2");
MODULE_SOFTDEP("pre: ccm");
-#endif /* CONFIG_CIFS_SMB2 */
module_init(init_cifs)
module_exit(exit_cifs)
extern void add_interrupt_randomness(int irq, int irq_flags) __latent_entropy;
extern void get_random_bytes(void *buf, int nbytes);
+ extern int wait_for_random_bytes(void);
extern int add_random_ready_callback(struct random_ready_callback *rdy);
extern void del_random_ready_callback(struct random_ready_callback *rdy);
extern void get_random_bytes_arch(void *buf, int nbytes);
#endif
}
+/*
+ * On 64-bit architectures, protect against non-terminated C string overflows
+ * by zeroing out the first byte of the canary; this leaves 56 bits of entropy.
+ */
+#ifdef CONFIG_64BIT
+# ifdef __LITTLE_ENDIAN
+# define CANARY_MASK 0xffffffffffffff00UL
+# else /* big endian, 64 bits: */
+# define CANARY_MASK 0x00ffffffffffffffUL
+# endif
+#else /* 32 bits: */
+# define CANARY_MASK 0xffffffffUL
+#endif
+
+static inline unsigned long get_random_canary(void)
+{
+ unsigned long val = get_random_long();
+
+ return val & CANARY_MASK;
+}
+
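/*
 * Editor's illustrative sketch (not part of this patch): get_random_canary()
 * is intended for stack-protector style canaries, where the first in-memory
 * byte is kept zero so that an overflowing NUL-terminated C string cannot
 * reproduce the full canary.  A hypothetical consumer (names below are
 * examples, not existing kernel symbols):
 */
struct example_ctx {
	unsigned long canary;
};

static inline void example_ctx_init(struct example_ctx *ctx)
{
	/* Already masked: 56 bits of entropy on 64-bit, 32 bits on 32-bit. */
	ctx->canary = get_random_canary();
}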
+ /* Calls wait_for_random_bytes() and then calls get_random_bytes(buf, nbytes).
+ * Returns the result of the call to wait_for_random_bytes. */
+ static inline int get_random_bytes_wait(void *buf, int nbytes)
+ {
+ int ret = wait_for_random_bytes();
+ if (unlikely(ret))
+ return ret;
+ get_random_bytes(buf, nbytes);
+ return 0;
+ }
+
+ #define declare_get_random_var_wait(var) \
+ static inline int get_random_ ## var ## _wait(var *out) { \
+ int ret = wait_for_random_bytes(); \
+ if (unlikely(ret)) \
+ return ret; \
+ *out = get_random_ ## var(); \
+ return 0; \
+ }
+ declare_get_random_var_wait(u32)
+ declare_get_random_var_wait(u64)
+ declare_get_random_var_wait(int)
+ declare_get_random_var_wait(long)
+ #undef declare_get_random_var_wait
+
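/*
 * Editor's illustrative sketch (not part of this patch): the *_wait wrappers
 * generated above are for callers that would rather fail (or be restarted)
 * than consume unseeded randomness.  Hypothetical example using the u32
 * variant for a hash seed:
 */
static inline int example_init_hash_seed(u32 *hash_seed)
{
	/* Fills *hash_seed and returns 0, or -ERESTARTSYS if interrupted. */
	return get_random_u32_wait(hash_seed);
}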
unsigned long randomize_page(unsigned long start, unsigned long range);
u32 prandom_u32(void);
write to these files.
For detailed documentation on the debugfs API, see
- Documentation/DocBook/filesystems.
+ Documentation/filesystems/.
If unsure, say N.
menu "Debug Lockups and Hangs"
config LOCKUP_DETECTOR
- bool "Detect Hard and Soft Lockups"
+ bool
+
+config SOFTLOCKUP_DETECTOR
+ bool "Detect Soft Lockups"
depends on DEBUG_KERNEL && !S390
+ select LOCKUP_DETECTOR
help
Say Y here to enable the kernel to act as a watchdog to detect
- hard and soft lockups.
+ soft lockups.
Softlockups are bugs that cause the kernel to loop in kernel
mode for more than 20 seconds, without giving other tasks a
chance to run. The current stack trace is displayed upon
detection and the system will stay locked up.
+config HARDLOCKUP_DETECTOR_PERF
+ bool
+ select SOFTLOCKUP_DETECTOR
+
+#
+# arch/ can define HAVE_HARDLOCKUP_DETECTOR_ARCH to provide their own hard
+# lockup detector rather than the perf based detector.
+#
+config HARDLOCKUP_DETECTOR
+ bool "Detect Hard Lockups"
+ depends on DEBUG_KERNEL && !S390
+ depends on HAVE_HARDLOCKUP_DETECTOR_PERF || HAVE_HARDLOCKUP_DETECTOR_ARCH
+ select LOCKUP_DETECTOR
+ select HARDLOCKUP_DETECTOR_PERF if HAVE_HARDLOCKUP_DETECTOR_PERF
+ select HARDLOCKUP_DETECTOR_ARCH if HAVE_HARDLOCKUP_DETECTOR_ARCH
+ help
+ Say Y here to enable the kernel to act as a watchdog to detect
+ hard lockups.
+
Hardlockups are bugs that cause the CPU to loop in kernel mode
for more than 10 seconds, without letting other interrupts have a
chance to run. The current stack trace is displayed upon detection
and the system will stay locked up.
- The overhead should be minimal. A periodic hrtimer runs to
- generate interrupts and kick the watchdog task every 4 seconds.
- An NMI is generated every 10 seconds or so to check for hardlockups.
-
- The frequency of hrtimer and NMI events and the soft and hard lockup
- thresholds can be controlled through the sysctl watchdog_thresh.
-
-config HARDLOCKUP_DETECTOR
- def_bool y
- depends on LOCKUP_DETECTOR && !HAVE_NMI_WATCHDOG
- depends on PERF_EVENTS && HAVE_PERF_EVENTS_NMI
-
config BOOTPARAM_HARDLOCKUP_PANIC
bool "Panic (Reboot) On Hard Lockups"
depends on HARDLOCKUP_DETECTOR
config BOOTPARAM_SOFTLOCKUP_PANIC
bool "Panic (Reboot) On Soft Lockups"
- depends on LOCKUP_DETECTOR
+ depends on SOFTLOCKUP_DETECTOR
help
Say Y here to enable the kernel to panic on "soft lockups",
which are bugs that cause the kernel to loop in kernel
config BOOTPARAM_SOFTLOCKUP_PANIC_VALUE
int
- depends on LOCKUP_DETECTOR
+ depends on SOFTLOCKUP_DETECTOR
range 0 1
default 0 if !BOOTPARAM_SOFTLOCKUP_PANIC
default 1 if BOOTPARAM_SOFTLOCKUP_PANIC
config DETECT_HUNG_TASK
bool "Detect Hung Tasks"
depends on DEBUG_KERNEL
- default LOCKUP_DETECTOR
+ default SOFTLOCKUP_DETECTOR
help
Say Y here to enable the kernel to detect "hung tasks",
which are bugs that cause the task to be stuck in
depends on DEBUG_KERNEL && TRACE_IRQFLAGS_SUPPORT && STACKTRACE_SUPPORT && LOCKDEP_SUPPORT
select DEBUG_SPINLOCK
select DEBUG_MUTEXES
+ select DEBUG_RT_MUTEXES if RT_MUTEXES
select LOCKDEP
help
This feature will check whether any held lock (spinlock, rwlock,
select LOCKDEP
select DEBUG_SPINLOCK
select DEBUG_MUTEXES
+ select DEBUG_RT_MUTEXES if RT_MUTEXES
select DEBUG_LOCK_ALLOC
select TRACE_IRQFLAGS
default n
select LOCKDEP
select DEBUG_SPINLOCK
select DEBUG_MUTEXES
+ select DEBUG_RT_MUTEXES if RT_MUTEXES
select DEBUG_LOCK_ALLOC
default n
help
It is also used by various kernel debugging features that require
stack trace generation.
+ config WARN_ALL_UNSEEDED_RANDOM
+ bool "Warn for all uses of unseeded randomness"
+ default n
+ help
+ Some parts of the kernel contain bugs relating to their use of
+ cryptographically secure random numbers before it's actually possible
+ to generate those numbers securely. This setting ensures that these
+ flaws don't go unnoticed, by enabling a message, should this ever
+ occur. This will allow people with obscure setups to know when things
+ are going wrong, so that they might contact developers about fixing
+ it.
+
+ Unfortunately, on some models of some architectures getting
+ a fully seeded CRNG is extremely difficult, and so this can
+ result in dmesg getting spammed for a surprisingly long
+ time. This is really bad from a security perspective, and
+ so architecture maintainers really need to do what they can
+ to get the CRNG seeded sooner after the system is booted.
+ However, since users cannot do anything actionable to
+ address this, by default the kernel will issue only a single
+ warning for the first use of unseeded randomness.
+
+ Say Y here if you want to receive warnings for all uses of
+ unseeded randomness. This will be of use primarily for
+ those developers interested in improving the security of
+ Linux kernels running on their architecture (or
+ subarchitecture).
+
config DEBUG_KOBJECT
bool "kobject debugging"
depends on DEBUG_KERNEL
If unsure, say N.
-menu "RCU Debugging"
-
-config PROVE_RCU
- def_bool PROVE_LOCKING
-
-config PROVE_RCU_REPEATEDLY
- bool "RCU debugging: don't disable PROVE_RCU on first splat"
- depends on PROVE_RCU
- default n
- help
- By itself, PROVE_RCU will disable checking upon issuing the
- first warning (or "splat"). This feature prevents such
- disabling, allowing multiple RCU-lockdep warnings to be printed
- on a single reboot.
-
- Say Y to allow multiple RCU-lockdep warnings per boot.
-
- Say N if you are unsure.
-
-config SPARSE_RCU_POINTER
- bool "RCU debugging: sparse-based checks for pointer usage"
- default n
- help
- This feature enables the __rcu sparse annotation for
- RCU-protected pointers. This annotation will cause sparse
- to flag any non-RCU used of annotated pointers. This can be
- helpful when debugging RCU usage. Please note that this feature
- is not intended to enforce code cleanliness; it is instead merely
- a debugging aid.
-
- Say Y to make sparse flag questionable use of RCU-protected pointers
-
- Say N if you are unsure.
-
-config TORTURE_TEST
- tristate
- default n
-
-config RCU_PERF_TEST
- tristate "performance tests for RCU"
- depends on DEBUG_KERNEL
- select TORTURE_TEST
- select SRCU
- select TASKS_RCU
- default n
- help
- This option provides a kernel module that runs performance
- tests on the RCU infrastructure. The kernel module may be built
- after the fact on the running kernel to be tested, if desired.
-
- Say Y here if you want RCU performance tests to be built into
- the kernel.
- Say M if you want the RCU performance tests to build as a module.
- Say N if you are unsure.
-
-config RCU_TORTURE_TEST
- tristate "torture tests for RCU"
- depends on DEBUG_KERNEL
- select TORTURE_TEST
- select SRCU
- select TASKS_RCU
- default n
- help
- This option provides a kernel module that runs torture tests
- on the RCU infrastructure. The kernel module may be built
- after the fact on the running kernel to be tested, if desired.
-
- Say Y here if you want RCU torture tests to be built into
- the kernel.
- Say M if you want the RCU torture tests to build as a module.
- Say N if you are unsure.
-
-config RCU_TORTURE_TEST_SLOW_PREINIT
- bool "Slow down RCU grace-period pre-initialization to expose races"
- depends on RCU_TORTURE_TEST
- help
- This option delays grace-period pre-initialization (the
- propagation of CPU-hotplug changes up the rcu_node combining
- tree) for a few jiffies between initializing each pair of
- consecutive rcu_node structures. This helps to expose races
- involving grace-period pre-initialization, in other words, it
- makes your kernel less stable. It can also greatly increase
- grace-period latency, especially on systems with large numbers
- of CPUs. This is useful when torture-testing RCU, but in
- almost no other circumstance.
-
- Say Y here if you want your system to crash and hang more often.
- Say N if you want a sane system.
-
-config RCU_TORTURE_TEST_SLOW_PREINIT_DELAY
- int "How much to slow down RCU grace-period pre-initialization"
- range 0 5
- default 3
- depends on RCU_TORTURE_TEST_SLOW_PREINIT
- help
- This option specifies the number of jiffies to wait between
- each rcu_node structure pre-initialization step.
-
-config RCU_TORTURE_TEST_SLOW_INIT
- bool "Slow down RCU grace-period initialization to expose races"
- depends on RCU_TORTURE_TEST
- help
- This option delays grace-period initialization for a few
- jiffies between initializing each pair of consecutive
- rcu_node structures. This helps to expose races involving
- grace-period initialization, in other words, it makes your
- kernel less stable. It can also greatly increase grace-period
- latency, especially on systems with large numbers of CPUs.
- This is useful when torture-testing RCU, but in almost no
- other circumstance.
-
- Say Y here if you want your system to crash and hang more often.
- Say N if you want a sane system.
-
-config RCU_TORTURE_TEST_SLOW_INIT_DELAY
- int "How much to slow down RCU grace-period initialization"
- range 0 5
- default 3
- depends on RCU_TORTURE_TEST_SLOW_INIT
- help
- This option specifies the number of jiffies to wait between
- each rcu_node structure initialization.
-
-config RCU_TORTURE_TEST_SLOW_CLEANUP
- bool "Slow down RCU grace-period cleanup to expose races"
- depends on RCU_TORTURE_TEST
- help
- This option delays grace-period cleanup for a few jiffies
- between cleaning up each pair of consecutive rcu_node
- structures. This helps to expose races involving grace-period
- cleanup, in other words, it makes your kernel less stable.
- It can also greatly increase grace-period latency, especially
- on systems with large numbers of CPUs. This is useful when
- torture-testing RCU, but in almost no other circumstance.
-
- Say Y here if you want your system to crash and hang more often.
- Say N if you want a sane system.
-
-config RCU_TORTURE_TEST_SLOW_CLEANUP_DELAY
- int "How much to slow down RCU grace-period cleanup"
- range 0 5
- default 3
- depends on RCU_TORTURE_TEST_SLOW_CLEANUP
- help
- This option specifies the number of jiffies to wait between
- each rcu_node structure cleanup operation.
-
-config RCU_CPU_STALL_TIMEOUT
- int "RCU CPU stall timeout in seconds"
- depends on RCU_STALL_COMMON
- range 3 300
- default 21
- help
- If a given RCU grace period extends more than the specified
- number of seconds, a CPU stall warning is printed. If the
- RCU grace period persists, additional CPU stall warnings are
- printed at more widely spaced intervals.
-
-config RCU_TRACE
- bool "Enable tracing for RCU"
- depends on DEBUG_KERNEL
- default y if TREE_RCU
- select TRACE_CLOCK
- help
- This option provides tracing in RCU which presents stats
- in debugfs for debugging RCU implementation. It also enables
- additional tracepoints for ftrace-style event tracing.
-
- Say Y here if you want to enable RCU tracing
- Say N if you are unsure.
-
-config RCU_EQS_DEBUG
- bool "Provide debugging asserts for adding NO_HZ support to an arch"
- depends on DEBUG_KERNEL
- help
- This option provides consistency checks in RCU's handling of
- NO_HZ. These checks have proven quite helpful in detecting
- bugs in arch-specific NO_HZ code.
-
- Say N here if you need ultimate kernel/user switch latencies
- Say Y if you are unsure
-
-endmenu # "RCU Debugging"
+source "kernel/rcu/Kconfig.debug"
config DEBUG_WQ_FORCE_RR_CPU
bool "Force round-robin CPU selection for unbound work items"
config INTERVAL_TREE_TEST
tristate "Interval tree test"
- depends on m && DEBUG_KERNEL
+ depends on DEBUG_KERNEL
select INTERVAL_TREE
help
A benchmark measuring the performance of the interval tree library
If unsure, say N.
+config TEST_SYSCTL
+ tristate "sysctl test driver"
+ default n
+ depends on PROC_SYSCTL
+ help
+ This builds the "test_sysctl" module. This driver enables testing of the
+ proc sysctl interfaces available to drivers safely, without affecting
+ production knobs which might alter system functionality.
+
+ If unsure, say N.
+
config TEST_UDELAY
tristate "udelay test driver"
default n
If unsure, say N.
+config TEST_KMOD
+ tristate "kmod stress tester"
+ default n
+ depends on m
+ depends on BLOCK && (64BIT || LBDAF) # for XFS, BTRFS
+ depends on NETDEVICES && NET_CORE && INET # for TUN
+ select TEST_LKM
+ select XFS_FS
+ select TUN
+ select BTRFS_FS
+ help
+ Test the kernel's module loading mechanism: kmod. kmod implements
+ support to load modules using the Linux kernel's usermode helper.
+ This test provides a series of tests against kmod.
+
+ Although technically you could build test_kmod either as a module or
+ into the kernel, we disallow building it into the kernel, since
+ it stress tests request_module() and would very likely cause
+ issues by taking over precious threads available for other
+ module load requests; ultimately, this could be fatal.
+
+ To run tests run:
+
+ tools/testing/selftests/kmod/kmod.sh --help
+
+ If unsure, say N.
+
source "samples/Kconfig"
source "lib/Kconfig.kgdb"
int i;
size = sizeof(*tbl) + nbuckets * sizeof(tbl->buckets[0]);
- if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER) ||
- gfp != GFP_KERNEL)
+ if (gfp != GFP_KERNEL)
tbl = kzalloc(size, gfp | __GFP_NOWARN | __GFP_NORETRY);
- if (tbl == NULL && gfp == GFP_KERNEL)
- tbl = vzalloc(size);
+ else
+ tbl = kvzalloc(size, gfp);
size = nbuckets;
INIT_LIST_HEAD(&tbl->walkers);
- get_random_bytes(&tbl->hash_rnd, sizeof(tbl->hash_rnd));
+ tbl->hash_rnd = get_random_u32();
for (i = 0; i < nbuckets; i++)
INIT_RHT_NULLS_HEAD(tbl->buckets[i], ht, i);
module_param_cb(supported_features, ¶m_ops_supported_features, NULL,
S_IRUGO);
-/*
- * find filename portion of a path (/foo/bar/baz -> baz)
- */
-const char *ceph_file_part(const char *s, int len)
-{
- const char *e = s + len;
-
- while (e != s && *(e-1) != '/')
- e--;
- return e;
-}
-EXPORT_SYMBOL(ceph_file_part);
-
const char *ceph_msg_type_name(int type)
{
switch (type) {
case CEPH_MSG_OSD_OP: return "osd_op";
case CEPH_MSG_OSD_OPREPLY: return "osd_opreply";
case CEPH_MSG_WATCH_NOTIFY: return "watch_notify";
+ case CEPH_MSG_OSD_BACKOFF: return "osd_backoff";
default: return "unknown";
}
}
{
struct ceph_client *client;
struct ceph_entity_addr *myaddr = NULL;
- int err = -ENOMEM;
+ int err;
+
+ err = wait_for_random_bytes();
+ if (err < 0)
+ return ERR_PTR(err);
client = kzalloc(sizeof(*client), GFP_KERNEL);
if (client == NULL)
EXPORT_SYMBOL(neigh_rand_reach_time);
+static bool neigh_del(struct neighbour *n, __u8 state,
+ struct neighbour __rcu **np, struct neigh_table *tbl)
+{
+ bool retval = false;
+
+ write_lock(&n->lock);
+ if (refcount_read(&n->refcnt) == 1 && !(n->nud_state & state)) {
+ struct neighbour *neigh;
+
+ neigh = rcu_dereference_protected(n->next,
+ lockdep_is_held(&tbl->lock));
+ rcu_assign_pointer(*np, neigh);
+ n->dead = 1;
+ retval = true;
+ }
+ write_unlock(&n->lock);
+ if (retval)
+ neigh_cleanup_and_release(n);
+ return retval;
+}
+
+bool neigh_remove_one(struct neighbour *ndel, struct neigh_table *tbl)
+{
+ struct neigh_hash_table *nht;
+ void *pkey = ndel->primary_key;
+ u32 hash_val;
+ struct neighbour *n;
+ struct neighbour __rcu **np;
+
+ nht = rcu_dereference_protected(tbl->nht,
+ lockdep_is_held(&tbl->lock));
+ hash_val = tbl->hash(pkey, ndel->dev, nht->hash_rnd);
+ hash_val = hash_val >> (32 - nht->hash_shift);
+
+ np = &nht->hash_buckets[hash_val];
+ while ((n = rcu_dereference_protected(*np,
+ lockdep_is_held(&tbl->lock)))) {
+ if (n == ndel)
+ return neigh_del(n, 0, np, tbl);
+ np = &n->next;
+ }
+ return false;
+}
+
static int neigh_forced_gc(struct neigh_table *tbl)
{
int shrunk = 0;
* - nobody refers to it.
* - it is not permanent
*/
- write_lock(&n->lock);
- if (atomic_read(&n->refcnt) == 1 &&
- !(n->nud_state & NUD_PERMANENT)) {
- rcu_assign_pointer(*np,
- rcu_dereference_protected(n->next,
- lockdep_is_held(&tbl->lock)));
- n->dead = 1;
- shrunk = 1;
- write_unlock(&n->lock);
- neigh_cleanup_and_release(n);
+ if (neigh_del(n, NUD_PERMANENT, np, tbl)) {
+ shrunk = 1;
continue;
}
- write_unlock(&n->lock);
np = &n->next;
}
}
neigh_del_timer(n);
n->dead = 1;
- if (atomic_read(&n->refcnt) != 1) {
+ if (refcount_read(&n->refcnt) != 1) {
/* The most unpleasant situation.
We must destroy neighbour entry,
but someone still uses it.
NEIGH_CACHE_STAT_INC(tbl, allocs);
n->tbl = tbl;
- atomic_set(&n->refcnt, 1);
+ refcount_set(&n->refcnt, 1);
n->dead = 1;
out:
return n;
static void neigh_get_hash_rnd(u32 *x)
{
- get_random_bytes(x, sizeof(*x));
- *x |= 1;
+ *x = get_random_u32() | 1;
}
static struct neigh_hash_table *neigh_hash_alloc(unsigned int shift)
rcu_read_lock_bh();
n = __neigh_lookup_noref(tbl, pkey, dev);
if (n) {
- if (!atomic_inc_not_zero(&n->refcnt))
+ if (!refcount_inc_not_zero(&n->refcnt))
n = NULL;
NEIGH_CACHE_STAT_INC(tbl, hits);
}
n = rcu_dereference_bh(n->next)) {
if (!memcmp(n->primary_key, pkey, key_len) &&
net_eq(dev_net(n->dev), net)) {
- if (!atomic_inc_not_zero(&n->refcnt))
+ if (!refcount_inc_not_zero(&n->refcnt))
n = NULL;
NEIGH_CACHE_STAT_INC(tbl, hits);
break;
static inline void neigh_parms_put(struct neigh_parms *parms)
{
- if (atomic_dec_and_test(&parms->refcnt))
+ if (refcount_dec_and_test(&parms->refcnt))
neigh_parms_destroy(parms);
}
if (time_before(n->used, n->confirmed))
n->used = n->confirmed;
- if (atomic_read(&n->refcnt) == 1 &&
+ if (refcount_read(&n->refcnt) == 1 &&
(state == NUD_FAILED ||
time_after(jiffies, n->used + NEIGH_VAR(n->parms, GC_STALETIME)))) {
*np = n->next;
p = kmemdup(&tbl->parms, sizeof(*p), GFP_KERNEL);
if (p) {
p->tbl = tbl;
- atomic_set(&p->refcnt, 1);
+ refcount_set(&p->refcnt, 1);
p->reachable_time =
neigh_rand_reach_time(NEIGH_VAR(p, BASE_REACHABLE_TIME));
dev_hold(dev);
INIT_LIST_HEAD(&tbl->parms_list);
list_add(&tbl->parms.list, &tbl->parms_list);
write_pnet(&tbl->parms.net, &init_net);
- atomic_set(&tbl->parms.refcnt, 1);
+ refcount_set(&tbl->parms.refcnt, 1);
tbl->parms.reachable_time =
neigh_rand_reach_time(NEIGH_VAR(&tbl->parms, BASE_REACHABLE_TIME));
NEIGH_UPDATE_F_OVERRIDE |
NEIGH_UPDATE_F_ADMIN,
NETLINK_CB(skb).portid);
+ write_lock_bh(&tbl->lock);
neigh_release(neigh);
+ neigh_remove_one(neigh, tbl);
+ write_unlock_bh(&tbl->lock);
out:
return err;
if ((parms->dev &&
nla_put_u32(skb, NDTPA_IFINDEX, parms->dev->ifindex)) ||
- nla_put_u32(skb, NDTPA_REFCNT, atomic_read(&parms->refcnt)) ||
+ nla_put_u32(skb, NDTPA_REFCNT, refcount_read(&parms->refcnt)) ||
nla_put_u32(skb, NDTPA_QUEUE_LENBYTES,
NEIGH_VAR(parms, QUEUE_LEN_BYTES)) ||
/* approximative value for deprecated QUEUE_LEN (in packets) */
ci.ndm_used = jiffies_to_clock_t(now - neigh->used);
ci.ndm_confirmed = jiffies_to_clock_t(now - neigh->confirmed);
ci.ndm_updated = jiffies_to_clock_t(now - neigh->updated);
- ci.ndm_refcnt = atomic_read(&neigh->refcnt) - 1;
+ ci.ndm_refcnt = refcount_read(&neigh->refcnt) - 1;
read_unlock_bh(&neigh->lock);
if (nla_put_u32(skb, NDA_PROBES, atomic_read(&neigh->probes)) ||
#include <net/ip_tunnels.h>
#include <net/l3mdev.h>
+#include "fib_lookup.h"
+
#define RT_FL_TOS(oldflp4) \
((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
build_sk_flow_key(fl4, sk);
}
-static inline void rt_free(struct rtable *rt)
-{
- call_rcu(&rt->dst.rcu_head, dst_rcu_free);
-}
-
static DEFINE_SPINLOCK(fnhe_lock);
static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
rt = rcu_dereference(fnhe->fnhe_rth_input);
if (rt) {
RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
- rt_free(rt);
+ dst_dev_put(&rt->dst);
+ dst_release(&rt->dst);
}
rt = rcu_dereference(fnhe->fnhe_rth_output);
if (rt) {
RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
- rt_free(rt);
+ dst_dev_put(&rt->dst);
+ dst_release(&rt->dst);
}
}
}
static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
- __be32 daddr)
+ __be32 daddr, const bool do_cache)
{
bool ret = false;
if (!rt->rt_gateway)
rt->rt_gateway = daddr;
- if (!(rt->dst.flags & DST_NOCACHE)) {
+ if (do_cache) {
+ dst_hold(&rt->dst);
rcu_assign_pointer(*porig, rt);
- if (orig)
- rt_free(orig);
+ if (orig) {
+ dst_dev_put(&orig->dst);
+ dst_release(&orig->dst);
+ }
ret = true;
}
}
orig = *p;
+ /* hold dst before doing cmpxchg() to avoid race condition
+ * on this dst
+ */
+ dst_hold(&rt->dst);
prev = cmpxchg(p, orig, rt);
if (prev == orig) {
- if (orig)
- rt_free(orig);
- } else
+ if (orig) {
+ dst_dev_put(&orig->dst);
+ dst_release(&orig->dst);
+ }
+ } else {
+ dst_release(&rt->dst);
ret = false;
+ }
return ret;
}
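/*
 * Editor's illustrative sketch (not part of this patch): the hold-before-
 * cmpxchg pattern used above, in isolation.  A reference is taken before
 * publication so that no reader can observe the pointer while its refcount
 * is still unaccounted for; the loser of the race simply drops the extra
 * reference again.  All names below are hypothetical (assumes
 * <linux/refcount.h> and <linux/slab.h>).
 */
struct example_obj {
	refcount_t refcnt;
};

static void example_obj_put(struct example_obj *obj)
{
	if (refcount_dec_and_test(&obj->refcnt))
		kfree(obj);
}

static bool example_publish(struct example_obj **slot, struct example_obj *obj)
{
	struct example_obj *old = READ_ONCE(*slot);

	refcount_inc(&obj->refcnt);		/* ref to be owned by *slot */
	if (cmpxchg(slot, old, obj) == old) {
		if (old)
			example_obj_put(old);	/* drop the displaced entry */
		return true;
	}
	example_obj_put(obj);			/* lost the race: undo our hold */
	return false;
}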
static void ipv4_dst_destroy(struct dst_entry *dst)
{
+ struct dst_metrics *p = (struct dst_metrics *)DST_METRICS_PTR(dst);
struct rtable *rt = (struct rtable *) dst;
+ if (p != &dst_default_metrics && atomic_dec_and_test(&p->refcnt))
+ kfree(p);
+
if (!list_empty(&rt->rt_uncached)) {
struct uncached_list *ul = rt->rt_uncached_list;
static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
const struct fib_result *res,
struct fib_nh_exception *fnhe,
- struct fib_info *fi, u16 type, u32 itag)
+ struct fib_info *fi, u16 type, u32 itag,
+ const bool do_cache)
{
bool cached = false;
rt->rt_gateway = nh->nh_gw;
rt->rt_uses_gateway = 1;
}
- dst_init_metrics(&rt->dst, fi->fib_metrics, true);
+ dst_init_metrics(&rt->dst, fi->fib_metrics->metrics, true);
+ if (fi->fib_metrics != &dst_default_metrics) {
+ rt->dst._metrics |= DST_METRICS_REFCOUNTED;
+ atomic_inc(&fi->fib_metrics->refcnt);
+ }
#ifdef CONFIG_IP_ROUTE_CLASSID
rt->dst.tclassid = nh->nh_tclassid;
#endif
rt->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
if (unlikely(fnhe))
- cached = rt_bind_exception(rt, fnhe, daddr);
- else if (!(rt->dst.flags & DST_NOCACHE))
+ cached = rt_bind_exception(rt, fnhe, daddr, do_cache);
+ else if (do_cache)
cached = rt_cache_route(nh, rt);
if (unlikely(!cached)) {
/* Routes we intend to cache in nexthop exception or
* However, if we are unsuccessful at storing this
* route into the cache we really need to set it.
*/
- rt->dst.flags |= DST_NOCACHE;
if (!rt->rt_gateway)
rt->rt_gateway = daddr;
rt_add_uncached_list(rt);
struct rtable *rt;
rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
- (will_cache ? 0 : (DST_HOST | DST_NOCACHE)) |
+ (will_cache ? 0 : DST_HOST) |
(nopolicy ? DST_NOPOLICY : 0) |
(noxfrm ? DST_NOXFRM : 0));
rth->dst.input = ip_forward;
- rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag);
+ rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag,
+ do_cache);
set_lwt_redirect(rth);
skb_dst_set(skb, &rth->dst);
out:
*/
static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
- u8 tos, struct net_device *dev)
+ u8 tos, struct net_device *dev,
+ struct fib_result *res)
{
- struct fib_result res;
struct in_device *in_dev = __in_dev_get_rcu(dev);
struct ip_tunnel_info *tun_info;
struct flowi4 fl4;
if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
goto martian_source;
- res.fi = NULL;
- res.table = NULL;
+ res->fi = NULL;
+ res->table = NULL;
if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
goto brd_input;
fl4.daddr = daddr;
fl4.saddr = saddr;
fl4.flowi4_uid = sock_net_uid(net, NULL);
- err = fib_lookup(net, &fl4, &res, 0);
+ err = fib_lookup(net, &fl4, res, 0);
if (err != 0) {
if (!IN_DEV_FORWARD(in_dev))
err = -EHOSTUNREACH;
goto no_route;
}
- if (res.type == RTN_BROADCAST)
+ if (res->type == RTN_BROADCAST)
goto brd_input;
- if (res.type == RTN_LOCAL) {
+ if (res->type == RTN_LOCAL) {
err = fib_validate_source(skb, saddr, daddr, tos,
0, dev, in_dev, &itag);
if (err < 0)
err = -EHOSTUNREACH;
goto no_route;
}
- if (res.type != RTN_UNICAST)
+ if (res->type != RTN_UNICAST)
goto martian_destination;
- err = ip_mkroute_input(skb, &res, in_dev, daddr, saddr, tos);
+ err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos);
out: return err;
brd_input:
goto martian_source;
}
flags |= RTCF_BROADCAST;
- res.type = RTN_BROADCAST;
+ res->type = RTN_BROADCAST;
RT_CACHE_STAT_INC(in_brd);
local_input:
do_cache = false;
- if (res.fi) {
+ if (res->fi) {
if (!itag) {
- rth = rcu_dereference(FIB_RES_NH(res).nh_rth_input);
+ rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
if (rt_cache_valid(rth)) {
skb_dst_set_noref(skb, &rth->dst);
err = 0;
}
rth = rt_dst_alloc(l3mdev_master_dev_rcu(dev) ? : net->loopback_dev,
- flags | RTCF_LOCAL, res.type,
+ flags | RTCF_LOCAL, res->type,
IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
if (!rth)
goto e_nobufs;
rth->dst.tclassid = itag;
#endif
rth->rt_is_input = 1;
- if (res.table)
- rth->rt_table_id = res.table->tb_id;
+ if (res->table)
+ rth->rt_table_id = res->table->tb_id;
RT_CACHE_STAT_INC(in_slow_tot);
- if (res.type == RTN_UNREACHABLE) {
+ if (res->type == RTN_UNREACHABLE) {
rth->dst.input= ip_error;
rth->dst.error= -err;
rth->rt_flags &= ~RTCF_LOCAL;
}
if (do_cache) {
- struct fib_nh *nh = &FIB_RES_NH(res);
+ struct fib_nh *nh = &FIB_RES_NH(*res);
rth->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
rth->dst.input = lwtunnel_input;
}
- if (unlikely(!rt_cache_route(nh, rth))) {
- rth->dst.flags |= DST_NOCACHE;
+ if (unlikely(!rt_cache_route(nh, rth)))
rt_add_uncached_list(rth);
- }
}
skb_dst_set(skb, &rth->dst);
err = 0;
no_route:
RT_CACHE_STAT_INC(in_no_route);
- res.type = RTN_UNREACHABLE;
- res.fi = NULL;
- res.table = NULL;
+ res->type = RTN_UNREACHABLE;
+ res->fi = NULL;
+ res->table = NULL;
goto local_input;
/*
int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
u8 tos, struct net_device *dev)
{
- int res;
+ struct fib_result res;
+ int err;
tos &= IPTOS_RT_MASK;
rcu_read_lock();
+ err = ip_route_input_rcu(skb, daddr, saddr, tos, dev, &res);
+ rcu_read_unlock();
+ return err;
+}
+EXPORT_SYMBOL(ip_route_input_noref);
+
+/* called with rcu_read_lock held */
+int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr,
+ u8 tos, struct net_device *dev, struct fib_result *res)
+{
/* Multicast recognition logic is moved from route cache to here.
The problem was that too many Ethernet cards have broken/missing
hardware multicast filters :-( As result the host on multicasting
if (ipv4_is_multicast(daddr)) {
struct in_device *in_dev = __in_dev_get_rcu(dev);
int our = 0;
+ int err = -EINVAL;
if (in_dev)
our = ip_check_mc_rcu(in_dev, daddr, saddr,
ip_hdr(skb)->protocol);
}
- res = -EINVAL;
if (our
#ifdef CONFIG_IP_MROUTE
||
IN_DEV_MFORWARD(in_dev))
#endif
) {
- res = ip_route_input_mc(skb, daddr, saddr,
+ err = ip_route_input_mc(skb, daddr, saddr,
tos, dev, our);
}
- rcu_read_unlock();
- return res;
+ return err;
}
- res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
- rcu_read_unlock();
- return res;
+
+ return ip_route_input_slow(skb, daddr, saddr, tos, dev, res);
}
-EXPORT_SYMBOL(ip_route_input_noref);
/* called with rcu_read_lock() */
static struct rtable *__mkroute_output(const struct fib_result *res,
rth = rcu_dereference(*prth);
rt_cache:
- if (rt_cache_valid(rth)) {
- dst_hold(&rth->dst);
+ if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst))
return rth;
- }
}
add:
#endif
}
- rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0);
+ rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0, do_cache);
set_lwt_redirect(rth);
return rth;
* Major route resolver routine.
*/
-struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
- const struct sk_buff *skb)
+struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
+ const struct sk_buff *skb)
{
- struct net_device *dev_out = NULL;
__u8 tos = RT_FL_TOS(fl4);
- unsigned int flags = 0;
struct fib_result res;
struct rtable *rth;
- int orig_oif;
- int err = -ENETUNREACH;
res.tclassid = 0;
res.fi = NULL;
res.table = NULL;
- orig_oif = fl4->flowi4_oif;
-
fl4->flowi4_iif = LOOPBACK_IFINDEX;
fl4->flowi4_tos = tos & IPTOS_RT_MASK;
fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
rcu_read_lock();
+ rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb);
+ rcu_read_unlock();
+
+ return rth;
+}
+EXPORT_SYMBOL_GPL(ip_route_output_key_hash);
+
+struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4,
+ struct fib_result *res,
+ const struct sk_buff *skb)
+{
+ struct net_device *dev_out = NULL;
+ int orig_oif = fl4->flowi4_oif;
+ unsigned int flags = 0;
+ struct rtable *rth;
+ int err = -ENETUNREACH;
+
if (fl4->saddr) {
rth = ERR_PTR(-EINVAL);
if (ipv4_is_multicast(fl4->saddr) ||
fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
dev_out = net->loopback_dev;
fl4->flowi4_oif = LOOPBACK_IFINDEX;
- res.type = RTN_LOCAL;
+ res->type = RTN_LOCAL;
flags |= RTCF_LOCAL;
goto make_route;
}
- err = fib_lookup(net, fl4, &res, 0);
+ err = fib_lookup(net, fl4, res, 0);
if (err) {
- res.fi = NULL;
- res.table = NULL;
+ res->fi = NULL;
+ res->table = NULL;
if (fl4->flowi4_oif &&
(ipv4_is_multicast(fl4->daddr) ||
!netif_index_is_l3_master(net, fl4->flowi4_oif))) {
if (fl4->saddr == 0)
fl4->saddr = inet_select_addr(dev_out, 0,
RT_SCOPE_LINK);
- res.type = RTN_UNICAST;
+ res->type = RTN_UNICAST;
goto make_route;
}
rth = ERR_PTR(err);
goto out;
}
- if (res.type == RTN_LOCAL) {
+ if (res->type == RTN_LOCAL) {
if (!fl4->saddr) {
- if (res.fi->fib_prefsrc)
- fl4->saddr = res.fi->fib_prefsrc;
+ if (res->fi->fib_prefsrc)
+ fl4->saddr = res->fi->fib_prefsrc;
else
fl4->saddr = fl4->daddr;
}
/* L3 master device is the loopback for that domain */
- dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(res)) ? :
+ dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) ? :
net->loopback_dev;
fl4->flowi4_oif = dev_out->ifindex;
flags |= RTCF_LOCAL;
goto make_route;
}
- fib_select_path(net, &res, fl4, skb);
+ fib_select_path(net, res, fl4, skb);
- dev_out = FIB_RES_DEV(res);
+ dev_out = FIB_RES_DEV(*res);
fl4->flowi4_oif = dev_out->ifindex;
make_route:
- rth = __mkroute_output(&res, fl4, orig_oif, dev_out, flags);
+ rth = __mkroute_output(res, fl4, orig_oif, dev_out, flags);
out:
- rcu_read_unlock();
return rth;
}
-EXPORT_SYMBOL_GPL(__ip_route_output_key_hash);
static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
{
new->input = dst_discard;
new->output = dst_discard_out;
- new->dev = ort->dst.dev;
+ new->dev = net->loopback_dev;
if (new->dev)
dev_hold(new->dev);
rt->rt_uses_gateway = ort->rt_uses_gateway;
INIT_LIST_HEAD(&rt->rt_uncached);
- dst_free(new);
}
dst_release(dst_orig);
}
EXPORT_SYMBOL_GPL(ip_route_output_flow);
+/* called with rcu_read_lock held */
static int rt_fill_info(struct net *net, __be32 dst, __be32 src, u32 table_id,
struct flowi4 *fl4, struct sk_buff *skb, u32 portid,
- u32 seq, int event)
+ u32 seq)
{
struct rtable *rt = skb_rtable(skb);
struct rtmsg *r;
u32 error;
u32 metrics[RTAX_MAX];
- nlh = nlmsg_put(skb, portid, seq, event, sizeof(*r), 0);
+ nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), 0);
if (!nlh)
return -EMSGSIZE;
struct net *net = sock_net(in_skb->sk);
struct rtmsg *rtm;
struct nlattr *tb[RTA_MAX+1];
+ struct fib_result res = {};
struct rtable *rt = NULL;
struct flowi4 fl4;
__be32 dst = 0;
fl4.flowi4_mark = mark;
fl4.flowi4_uid = uid;
+ rcu_read_lock();
+
if (iif) {
struct net_device *dev;
- dev = __dev_get_by_index(net, iif);
+ dev = dev_get_by_index_rcu(net, iif);
if (!dev) {
err = -ENODEV;
goto errout_free;
skb->protocol = htons(ETH_P_IP);
skb->dev = dev;
skb->mark = mark;
- err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
+ err = ip_route_input_rcu(skb, dst, src, rtm->rtm_tos,
+ dev, &res);
rt = skb_rtable(skb);
if (err == 0 && rt->dst.error)
err = -rt->dst.error;
} else {
- rt = ip_route_output_key(net, &fl4);
-
+ rt = ip_route_output_key_hash_rcu(net, &fl4, &res, skb);
err = 0;
if (IS_ERR(rt))
err = PTR_ERR(rt);
if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
table_id = rt->rt_table_id;
- err = rt_fill_info(net, dst, src, table_id, &fl4, skb,
- NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
- RTM_NEWROUTE);
+ if (rtm->rtm_flags & RTM_F_FIB_MATCH)
+ err = fib_dump_info(skb, NETLINK_CB(in_skb).portid,
+ nlh->nlmsg_seq, RTM_NEWROUTE, table_id,
+ rt->rt_type, res.prefix, res.prefixlen,
+ fl4.flowi4_tos, res.fi, 0);
+ else
+ err = rt_fill_info(net, dst, src, table_id, &fl4, skb,
+ NETLINK_CB(in_skb).portid, nlh->nlmsg_seq);
if (err < 0)
goto errout_free;
+ rcu_read_unlock();
+
err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
errout:
return err;
errout_free:
+ rcu_read_unlock();
kfree_skb(skb);
goto errout;
}
{
atomic_set(&net->ipv4.rt_genid, 0);
atomic_set(&net->fnhe_genid, 0);
- get_random_bytes(&net->ipv4.dev_addr_genid,
- sizeof(net->ipv4.dev_addr_genid));
+ atomic_set(&net->ipv4.dev_addr_genid, get_random_int());
return 0;
}