drivers/net/sfc/rx.c

   1 /****************************************************************************
   2  * Driver for Solarflare Solarstorm network controllers and boards
   3  * Copyright 2005-2006 Fen Systems Ltd.
   4  * Copyright 2005-2008 Solarflare Communications Inc.
   5  *
   6  * This program is free software; you can redistribute it and/or modify it
   7  * under the terms of the GNU General Public License version 2 as published
   8  * by the Free Software Foundation, incorporated herein by reference.
   9  */
  10
  11 #include <linux/socket.h>
  12 #include <linux/in.h>
  13 #include <linux/ip.h>
  14 #include <linux/tcp.h>
  15 #include <linux/udp.h>
  16 #include <net/ip.h>
  17 #include <net/checksum.h>
  18 #include "net_driver.h"
  19 #include "rx.h"
  20 #include "efx.h"
  21 #include "falcon.h"
  22 #include "workarounds.h"
  23
  24 /* Number of RX descriptors pushed at once. */
  25 #define EFX_RX_BATCH  8
  26
  27 /* Size of buffer allocated for skb header area. */
  28 #define EFX_SKB_HEADERS  64u
  29
  30 /*
  31  * rx_alloc_method - RX buffer allocation method
  32  *
  33  * This driver supports two methods for allocating and using RX buffers:
  34  * each RX buffer may be backed by an skb or by an order-n page.
  35  *
  36  * When LRO is in use then the second method has a lower overhead,
  37  * since we don't have to allocate then free skbs on reassembled frames.
  38  *
  39  * Values:
  40  *   - RX_ALLOC_METHOD_AUTO = 0
  41  *   - RX_ALLOC_METHOD_SKB  = 1
  42  *   - RX_ALLOC_METHOD_PAGE = 2
  43  *
  44  * The heuristic for %RX_ALLOC_METHOD_AUTO is a simple hysteresis count
  45  * controlled by the parameters below.
  46  *
  47  *   - Pushing and popping descriptors are separated by the rx_queue
  48  *     size, so the watermarks should be ~rxd_size.
  49  *   - The performance win by using page-based allocation for LRO is less
  50  *     than the performance hit of using page-based allocation of non-LRO,
  51  *     so the watermarks should reflect this.
  52  *
  53  * Per channel we maintain a single variable, updated for each packet:
  54  *
  55  *   rx_alloc_level += (lro_performed ? RX_ALLOC_FACTOR_LRO :
  56  *                      RX_ALLOC_FACTOR_SKB)
  57  * Per NAPI poll interval, we constrain rx_alloc_level to 0..MAX (which
  58  * limits the hysteresis), and update the allocation strategy:
  59  *
  60  *   rx_alloc_method = (rx_alloc_level > RX_ALLOC_LEVEL_LRO ?
  61  *                      RX_ALLOC_METHOD_PAGE : RX_ALLOC_METHOD_SKB)
  62  */
  63 static int rx_alloc_method = RX_ALLOC_METHOD_PAGE;
  64
  65 #define RX_ALLOC_LEVEL_LRO 0x2000
  66 #define RX_ALLOC_LEVEL_MAX 0x3000
  67 #define RX_ALLOC_FACTOR_LRO 1
  68 #define RX_ALLOC_FACTOR_SKB (-2)
  69
  70 /* This is the percentage fill level below which new RX descriptors
  71  * will be added to the RX descriptor ring.
  72  */
  73 static unsigned int rx_refill_threshold = 90;
  74
  75 /* This is the percentage fill level to which an RX queue will be refilled
  76  * when the "RX refill threshold" is reached.
  77  */
  78 static unsigned int rx_refill_limit = 95;
  79
  80 /*
  81  * RX maximum head room required.
  82  *
  83  * This must be at least 1 to prevent overflow and at least 2 to allow
  84  * pipelined receives.
  85  */
  86 #define EFX_RXD_HEAD_ROOM 2
  87
  88 /* Macros for zero-order pages (potentially) containing multiple RX buffers */
  89 #define RX_DATA_OFFSET(_data)                           \
  90         (((unsigned long) (_data)) & (PAGE_SIZE-1))
  91 #define RX_BUF_OFFSET(_rx_buf)                          \
  92         RX_DATA_OFFSET((_rx_buf)->data)
  93
  94 #define RX_PAGE_SIZE(_efx)                              \
  95         (PAGE_SIZE * (1u << (_efx)->rx_buffer_order))
  96
  97
  98 /**************************************************************************
  99  *
 100  * Linux generic LRO handling
 101  *
 102  **************************************************************************
 103  */
 104
 105 static int efx_lro_get_skb_hdr(struct sk_buff *skb, void **ip_hdr,
 106                                void **tcpudp_hdr, u64 *hdr_flags, void *priv)
 107 {
 108         struct efx_channel *channel = (struct efx_channel *)priv;
 109         struct iphdr *iph;
 110         struct tcphdr *th;
 111
 112         iph = (struct iphdr *)skb->data;
 113         if (skb->protocol != htons(ETH_P_IP) || iph->protocol != IPPROTO_TCP)
 114                 goto fail;
 115
 116         th = (struct tcphdr *)(skb->data + iph->ihl * 4);
 117
 118         *tcpudp_hdr = th;
 119         *ip_hdr = iph;
 120         *hdr_flags = LRO_IPV4 | LRO_TCP;
 121
 122         channel->rx_alloc_level += RX_ALLOC_FACTOR_LRO;
 123         return 0;
 124 fail:
 125         channel->rx_alloc_level += RX_ALLOC_FACTOR_SKB;
 126         return -1;
 127 }
 128
 129 static int efx_get_frag_hdr(struct skb_frag_struct *frag, void **mac_hdr,
 130                             void **ip_hdr, void **tcpudp_hdr, u64 *hdr_flags,
 131                             void *priv)
 132 {
 133         struct efx_channel *channel = (struct efx_channel *)priv;
 134         struct ethhdr *eh;
 135         struct iphdr *iph;
 136
 137         /* We support EtherII and VLAN encapsulated IPv4 */
 138         eh = (struct ethhdr *)(page_address(frag->page) + frag->page_offset);
 139         *mac_hdr = eh;
 140
 141         if (eh->h_proto == htons(ETH_P_IP)) {
 142                 iph = (struct iphdr *)(eh + 1);
 143         } else {
 144                 struct vlan_ethhdr *veh = (struct vlan_ethhdr *)eh;
 145                 if (veh->h_vlan_encapsulated_proto != htons(ETH_P_IP))
 146                         goto fail;
 147
 148                 iph = (struct iphdr *)(veh + 1);
 149         }
 150         *ip_hdr = iph;
 151
 152         /* We can only do LRO over TCP */
 153         if (iph->protocol != IPPROTO_TCP)
 154                 goto fail;
 155
 156         *hdr_flags = LRO_IPV4 | LRO_TCP;
 157         *tcpudp_hdr = (struct tcphdr *)((u8 *) iph + iph->ihl * 4);
 158
 159         channel->rx_alloc_level += RX_ALLOC_FACTOR_LRO;
 160         return 0;
 161  fail:
 162         channel->rx_alloc_level += RX_ALLOC_FACTOR_SKB;
 163         return -1;
 164 }
 165
 166 int efx_lro_init(struct net_lro_mgr *lro_mgr, struct efx_nic *efx)
 167 {
 168         size_t s = sizeof(struct net_lro_desc) * EFX_MAX_LRO_DESCRIPTORS;
 169         struct net_lro_desc *lro_arr;
 170
 171         /* Allocate the LRO descriptors structure */
 172         lro_arr = kzalloc(s, GFP_KERNEL);
 173         if (lro_arr == NULL)
 174                 return -ENOMEM;
 175
 176         lro_mgr->lro_arr = lro_arr;
 177         lro_mgr->max_desc = EFX_MAX_LRO_DESCRIPTORS;
 178         lro_mgr->max_aggr = EFX_MAX_LRO_AGGR;
 179         lro_mgr->frag_align_pad = EFX_PAGE_SKB_ALIGN;
 180
 181         lro_mgr->get_skb_header = efx_lro_get_skb_hdr;
 182         lro_mgr->get_frag_header = efx_get_frag_hdr;
 183         lro_mgr->dev = efx->net_dev;
 184
 185         lro_mgr->features = LRO_F_NAPI;
 186
 187         /* We can pass packets up with the checksum intact */
 188         lro_mgr->ip_summed = CHECKSUM_UNNECESSARY;
 189
 190         lro_mgr->ip_summed_aggr = CHECKSUM_UNNECESSARY;
 191
 192         return 0;
 193 }
 194
 195 void efx_lro_fini(struct net_lro_mgr *lro_mgr)
 196 {
 197         kfree(lro_mgr->lro_arr);
 198         lro_mgr->lro_arr = NULL;
 199 }
 200
 201 /**
 202  * efx_init_rx_buffer_skb - create new RX buffer using skb-based allocation
 203  *
 204  * @rx_queue:           Efx RX queue
 205  * @rx_buf:             RX buffer structure to populate
 206  *
 207  * This allocates memory for a new receive buffer, maps it for DMA,
 208  * and populates a struct efx_rx_buffer with the relevant
 209  * information.  Return a negative error code or 0 on success.
 210  */
 211 static inline int efx_init_rx_buffer_skb(struct efx_rx_queue *rx_queue,
 212                                          struct efx_rx_buffer *rx_buf)
 213 {
 214         struct efx_nic *efx = rx_queue->efx;
 215         struct net_device *net_dev = efx->net_dev;
 216         int skb_len = efx->rx_buffer_len;
 217
 218         rx_buf->skb = netdev_alloc_skb(net_dev, skb_len);
 219         if (unlikely(!rx_buf->skb))
 220                 return -ENOMEM;
 221
 222         /* Adjust the SKB for padding and checksum */
 223         skb_reserve(rx_buf->skb, NET_IP_ALIGN);
 224         rx_buf->len = skb_len - NET_IP_ALIGN;
 225         rx_buf->data = (char *)rx_buf->skb->data;
 226         rx_buf->skb->ip_summed = CHECKSUM_UNNECESSARY;
 227
 228         rx_buf->dma_addr = pci_map_single(efx->pci_dev,
 229                                           rx_buf->data, rx_buf->len,
 230                                           PCI_DMA_FROMDEVICE);
 231
 232         if (unlikely(pci_dma_mapping_error(rx_buf->dma_addr))) {
 233                 dev_kfree_skb_any(rx_buf->skb);
 234                 rx_buf->skb = NULL;
 235                 return -EIO;
 236         }
 237
 238         return 0;
 239 }
 240
 241 /**
 242  * efx_init_rx_buffer_page - create new RX buffer using page-based allocation
 243  *
 244  * @rx_queue:           Efx RX queue
 245  * @rx_buf:             RX buffer structure to populate
 246  *
 247  * This allocates memory for a new receive buffer, maps it for DMA,
 248  * and populates a struct efx_rx_buffer with the relevant
 249  * information.  Return a negative error code or 0 on success.
 250  */
 251 static inline int efx_init_rx_buffer_page(struct efx_rx_queue *rx_queue,
 252                                           struct efx_rx_buffer *rx_buf)
 253 {
 254         struct efx_nic *efx = rx_queue->efx;
 255         int bytes, space, offset;
 256
 257         bytes = efx->rx_buffer_len - EFX_PAGE_IP_ALIGN;
 258
 259         /* If there is space left in the previously allocated page,
 260          * then use it. Otherwise allocate a new one */
 261         rx_buf->page = rx_queue->buf_page;
 262         if (rx_buf->page == NULL) {
 263                 dma_addr_t dma_addr;
 264
 265                 rx_buf->page = alloc_pages(__GFP_COLD | __GFP_COMP | GFP_ATOMIC,
 266                                            efx->rx_buffer_order);
 267                 if (unlikely(rx_buf->page == NULL))
 268                         return -ENOMEM;
 269
 270                 dma_addr = pci_map_page(efx->pci_dev, rx_buf->page,
 271                                         0, RX_PAGE_SIZE(efx),
 272                                         PCI_DMA_FROMDEVICE);
 273
 274                 if (unlikely(pci_dma_mapping_error(dma_addr))) {
 275                         __free_pages(rx_buf->page, efx->rx_buffer_order);
 276                         rx_buf->page = NULL;
 277                         return -EIO;
 278                 }
 279
 280                 rx_queue->buf_page = rx_buf->page;
 281                 rx_queue->buf_dma_addr = dma_addr;
 282                 rx_queue->buf_data = ((char *) page_address(rx_buf->page) +
 283                                       EFX_PAGE_IP_ALIGN);
 284         }
 285
 286         offset = RX_DATA_OFFSET(rx_queue->buf_data);
 287         rx_buf->len = bytes;
 288         rx_buf->dma_addr = rx_queue->buf_dma_addr + offset;
 289         rx_buf->data = rx_queue->buf_data;
 290
 291         /* Try to pack multiple buffers per page */
 292         if (efx->rx_buffer_order == 0) {
 293                 /* The next buffer starts on the next 512 byte boundary */
 294                 rx_queue->buf_data += ((bytes + 0x1ff) & ~0x1ff);
 295                 offset += ((bytes + 0x1ff) & ~0x1ff);
 296
 297                 space = RX_PAGE_SIZE(efx) - offset;
 298                 if (space >= bytes) {
 299                         /* Refs dropped on kernel releasing each skb */
 300                         get_page(rx_queue->buf_page);
 301                         goto out;
 302                 }
 303         }
 304
 305         /* This is the final RX buffer for this page, so mark it for
 306          * unmapping */
 307         rx_queue->buf_page = NULL;
 308         rx_buf->unmap_addr = rx_queue->buf_dma_addr;
 309
 310  out:
 311         return 0;
 312 }
 313
 314 /* This allocates memory for a new receive buffer, maps it for DMA,
 315  * and populates a struct efx_rx_buffer with the relevant
 316  * information.
 317  */
 318 static inline int efx_init_rx_buffer(struct efx_rx_queue *rx_queue,
 319                                      struct efx_rx_buffer *new_rx_buf)
 320 {
 321         int rc = 0;
 322
 323         if (rx_queue->channel->rx_alloc_push_pages) {
 324                 new_rx_buf->skb = NULL;
 325                 rc = efx_init_rx_buffer_page(rx_queue, new_rx_buf);
 326                 rx_queue->alloc_page_count++;
 327         } else {
 328                 new_rx_buf->page = NULL;
 329                 rc = efx_init_rx_buffer_skb(rx_queue, new_rx_buf);
 330                 rx_queue->alloc_skb_count++;
 331         }
 332
 333         if (unlikely(rc < 0))
 334                 EFX_LOG_RL(rx_queue->efx, "%s RXQ[%d] =%d\n", __func__,
 335                            rx_queue->queue, rc);
 336         return rc;
 337 }
 338
 339 static inline void efx_unmap_rx_buffer(struct efx_nic *efx,
 340                                        struct efx_rx_buffer *rx_buf)
 341 {
 342         if (rx_buf->page) {
 343                 EFX_BUG_ON_PARANOID(rx_buf->skb);
 344                 if (rx_buf->unmap_addr) {
 345                         pci_unmap_page(efx->pci_dev, rx_buf->unmap_addr,
 346                                        RX_PAGE_SIZE(efx), PCI_DMA_FROMDEVICE);
 347                         rx_buf->unmap_addr = 0;
 348                 }
 349         } else if (likely(rx_buf->skb)) {
 350                 pci_unmap_single(efx->pci_dev, rx_buf->dma_addr,
 351                                  rx_buf->len, PCI_DMA_FROMDEVICE);
 352         }
 353 }
 354
 355 static inline void efx_free_rx_buffer(struct efx_nic *efx,
 356                                       struct efx_rx_buffer *rx_buf)
 357 {
 358         if (rx_buf->page) {
 359                 __free_pages(rx_buf->page, efx->rx_buffer_order);
 360                 rx_buf->page = NULL;
 361         } else if (likely(rx_buf->skb)) {
 362                 dev_kfree_skb_any(rx_buf->skb);
 363                 rx_buf->skb = NULL;
 364         }
 365 }
 366
 367 static inline void efx_fini_rx_buffer(struct efx_rx_queue *rx_queue,
 368                                       struct efx_rx_buffer *rx_buf)
 369 {
 370         efx_unmap_rx_buffer(rx_queue->efx, rx_buf);
 371         efx_free_rx_buffer(rx_queue->efx, rx_buf);
 372 }
 373
 374 /**
 375  * efx_fast_push_rx_descriptors - push new RX descriptors quickly
 376  * @rx_queue:           RX descriptor queue
 377  * @retry:              Recheck the fill level
 378  * This will aim to fill the RX descriptor queue up to
 379  * @rx_queue->@fast_fill_limit. If there is insufficient atomic
 380  * memory to do so, the caller should retry.
 381  */
 382 static int __efx_fast_push_rx_descriptors(struct efx_rx_queue *rx_queue,
 383                                           int retry)
 384 {
 385         struct efx_rx_buffer *rx_buf;
 386         unsigned fill_level, index;
 387         int i, space, rc = 0;
 388
 389         /* Calculate current fill level.  Do this outside the lock,
 390          * because most of the time we'll end up not wanting to do the
 391          * fill anyway.
 392          */
 393         fill_level = (rx_queue->added_count - rx_queue->removed_count);
 394         EFX_BUG_ON_PARANOID(fill_level >
 395                             rx_queue->efx->type->rxd_ring_mask + 1);
 396
 397         /* Don't fill if we don't need to */
 398         if (fill_level >= rx_queue->fast_fill_trigger)
 399                 return 0;
 400
 401         /* Record minimum fill level */
 402         if (unlikely(fill_level < rx_queue->min_fill))
 403                 if (fill_level)
 404                         rx_queue->min_fill = fill_level;
 405
 406         /* Acquire RX add lock.  If this lock is contended, then a fast
 407          * fill must already be in progress (e.g. in the refill
 408          * tasklet), so we don't need to do anything
 409          */
 410         if (!spin_trylock_bh(&rx_queue->add_lock))
 411                 return -1;
 412
 413  retry:
 414         /* Recalculate current fill level now that we have the lock */
 415         fill_level = (rx_queue->added_count - rx_queue->removed_count);
 416         EFX_BUG_ON_PARANOID(fill_level >
 417                             rx_queue->efx->type->rxd_ring_mask + 1);
 418         space = rx_queue->fast_fill_limit - fill_level;
 419         if (space < EFX_RX_BATCH)
 420                 goto out_unlock;
 421
 422         EFX_TRACE(rx_queue->efx, "RX queue %d fast-filling descriptor ring from"
 423                   " level %d to level %d using %s allocation\n",
 424                   rx_queue->queue, fill_level, rx_queue->fast_fill_limit,
 425                   rx_queue->channel->rx_alloc_push_pages ? "page" : "skb");
 426
 427         do {
 428                 for (i = 0; i < EFX_RX_BATCH; ++i) {
 429                         index = (rx_queue->added_count &
 430                                  rx_queue->efx->type->rxd_ring_mask);
 431                         rx_buf = efx_rx_buffer(rx_queue, index);
 432                         rc = efx_init_rx_buffer(rx_queue, rx_buf);
 433                         if (unlikely(rc))
 434                                 goto out;
 435                         ++rx_queue->added_count;
 436                 }
 437         } while ((space -= EFX_RX_BATCH) >= EFX_RX_BATCH);
 438
 439         EFX_TRACE(rx_queue->efx, "RX queue %d fast-filled descriptor ring "
 440                   "to level %d\n", rx_queue->queue,
 441                   rx_queue->added_count - rx_queue->removed_count);
 442
 443  out:
 444         /* Send write pointer to card. */
 445         falcon_notify_rx_desc(rx_queue);
 446
 447         /* If the fast fill is running inside from the refill tasklet, then
 448          * for SMP systems it may be running on a different CPU to
 449          * RX event processing, which means that the fill level may now be
 450          * out of date. */
 451         if (unlikely(retry && (rc == 0)))
 452                 goto retry;
 453
 454  out_unlock:
 455         spin_unlock_bh(&rx_queue->add_lock);
 456
 457         return rc;
 458 }
 459
 /**
  * efx_fast_push_rx_descriptors - push new RX descriptors quickly
  * @rx_queue:           RX descriptor queue
  *
  * Makes a single attempt to fill the RX descriptor queue up to
  * @rx_queue->fast_fill_limit.  If that attempt reports a failure, the
  * slow-fill work item is scheduled to carry on.
  */
 void efx_fast_push_rx_descriptors(struct efx_rx_queue *rx_queue)
 {
         if (likely(__efx_fast_push_rx_descriptors(rx_queue, 0) == 0))
                 return;

         /* Run the work item straight away, in the hope that something
          * (e.g. an RX event or TX completion) is about to free some
          * memory.
          */
         efx_schedule_slow_fill(rx_queue, 0);
 }
 481
 482 void efx_rx_work(struct work_struct *data)
 483 {
 484         struct efx_rx_queue *rx_queue;
 485         int rc;
 486
 487         rx_queue = container_of(data, struct efx_rx_queue, work.work);
 488
 489         if (unlikely(!rx_queue->channel->enabled))
 490                 return;
 491
 492         EFX_TRACE(rx_queue->efx, "RX queue %d worker thread executing on CPU "
 493                   "%d\n", rx_queue->queue, raw_smp_processor_id());
 494
 495         ++rx_queue->slow_fill_count;
 496         /* Push new RX descriptors, allowing at least 1 jiffy for
 497          * the kernel to free some more memory. */
 498         rc = __efx_fast_push_rx_descriptors(rx_queue, 1);
 499         if (rc)
 500                 efx_schedule_slow_fill(rx_queue, 1);
 501 }
 502
 503 static inline void efx_rx_packet__check_len(struct efx_rx_queue *rx_queue,
 504                                             struct efx_rx_buffer *rx_buf,
 505                                             int len, int *discard,
 506                                             int *leak_packet)
 507 {
 508         struct efx_nic *efx = rx_queue->efx;
 509         unsigned max_len = rx_buf->len - efx->type->rx_buffer_padding;
 510
 511         if (likely(len <= max_len))
 512                 return;
 513
 514         /* The packet must be discarded, but this is only a fatal error
 515          * if the caller indicated it was
 516          */
 517         *discard = 1;
 518
 519         if ((len > rx_buf->len) && EFX_WORKAROUND_8071(efx)) {
 520                 EFX_ERR_RL(efx, " RX queue %d seriously overlength "
 521                            "RX event (0x%x > 0x%x+0x%x). Leaking\n",
 522                            rx_queue->queue, len, max_len,
 523                            efx->type->rx_buffer_padding);
 524                 /* If this buffer was skb-allocated, then the meta
 525                  * data at the end of the skb will be trashed. So
 526                  * we have no choice but to leak the fragment.
 527                  */
 528                 *leak_packet = (rx_buf->skb != NULL);
 529                 efx_schedule_reset(efx, RESET_TYPE_RX_RECOVERY);
 530         } else {
 531                 EFX_ERR_RL(efx, " RX queue %d overlength RX event "
 532                            "(0x%x > 0x%x)\n", rx_queue->queue, len, max_len);
 533         }
 534
 535         rx_queue->channel->n_rx_overlength++;
 536 }
 537
 538 /* Pass a received packet up through the generic LRO stack
 539  *
 540  * Handles driverlink veto, and passes the fragment up via
 541  * the appropriate LRO method
 542  */
 543 static inline void efx_rx_packet_lro(struct efx_channel *channel,
 544                                      struct efx_rx_buffer *rx_buf)
 545 {
 546         struct net_lro_mgr *lro_mgr = &channel->lro_mgr;
 547         void *priv = channel;
 548
 549         /* Pass the skb/page into the LRO engine */
 550         if (rx_buf->page) {
 551                 struct skb_frag_struct frags;
 552
 553                 frags.page = rx_buf->page;
 554                 frags.page_offset = RX_BUF_OFFSET(rx_buf);
 555                 frags.size = rx_buf->len;
 556
 557                 lro_receive_frags(lro_mgr, &frags, rx_buf->len,
 558                                   rx_buf->len, priv, 0);
 559
 560                 EFX_BUG_ON_PARANOID(rx_buf->skb);
 561                 rx_buf->page = NULL;
 562         } else {
 563                 EFX_BUG_ON_PARANOID(!rx_buf->skb);
 564
 565                 lro_receive_skb(lro_mgr, rx_buf->skb, priv);
 566                 rx_buf->skb = NULL;
 567         }
 568 }
 569
 570 /* Allocate and construct an SKB around a struct page.*/
 571 static inline struct sk_buff *efx_rx_mk_skb(struct efx_rx_buffer *rx_buf,
 572                                             struct efx_nic *efx,
 573                                             int hdr_len)
 574 {
 575         struct sk_buff *skb;
 576
 577         /* Allocate an SKB to store the headers */
 578         skb = netdev_alloc_skb(efx->net_dev, hdr_len + EFX_PAGE_SKB_ALIGN);
 579         if (unlikely(skb == NULL)) {
 580                 EFX_ERR_RL(efx, "RX out of memory for skb\n");
 581                 return NULL;
 582         }
 583
 584         EFX_BUG_ON_PARANOID(skb_shinfo(skb)->nr_frags);
 585         EFX_BUG_ON_PARANOID(rx_buf->len < hdr_len);
 586
 587         skb->ip_summed = CHECKSUM_UNNECESSARY;
 588         skb_reserve(skb, EFX_PAGE_SKB_ALIGN);
 589
 590         skb->len = rx_buf->len;
 591         skb->truesize = rx_buf->len + sizeof(struct sk_buff);
 592         memcpy(skb->data, rx_buf->data, hdr_len);
 593         skb->tail += hdr_len;
 594
 595         /* Append the remaining page onto the frag list */
 596         if (unlikely(rx_buf->len > hdr_len)) {
 597                 struct skb_frag_struct *frag = skb_shinfo(skb)->frags;
 598                 frag->page = rx_buf->page;
 599                 frag->page_offset = RX_BUF_OFFSET(rx_buf) + hdr_len;
 600                 frag->size = skb->len - hdr_len;
 601                 skb_shinfo(skb)->nr_frags = 1;
 602                 skb->data_len = frag->size;
 603         } else {
 604                 __free_pages(rx_buf->page, efx->rx_buffer_order);
 605                 skb->data_len = 0;
 606         }
 607
 608         /* Ownership has transferred from the rx_buf to skb */
 609         rx_buf->page = NULL;
 610
 611         /* Move past the ethernet header */
 612         skb->protocol = eth_type_trans(skb, efx->net_dev);
 613
 614         return skb;
 615 }
 616
 617 void efx_rx_packet(struct efx_rx_queue *rx_queue, unsigned int index,
 618                    unsigned int len, int checksummed, int discard)
 619 {
 620         struct efx_nic *efx = rx_queue->efx;
 621         struct efx_rx_buffer *rx_buf;
 622         int leak_packet = 0;
 623
 624         rx_buf = efx_rx_buffer(rx_queue, index);
 625         EFX_BUG_ON_PARANOID(!rx_buf->data);
 626         EFX_BUG_ON_PARANOID(rx_buf->skb && rx_buf->page);
 627         EFX_BUG_ON_PARANOID(!(rx_buf->skb || rx_buf->page));
 628
 629         /* This allows the refill path to post another buffer.
 630          * EFX_RXD_HEAD_ROOM ensures that the slot we are using
 631          * isn't overwritten yet.
 632          */
 633         rx_queue->removed_count++;
 634
 635         /* Validate the length encoded in the event vs the descriptor pushed */
 636         efx_rx_packet__check_len(rx_queue, rx_buf, len,
 637                                  &discard, &leak_packet);
 638
 639         EFX_TRACE(efx, "RX queue %d received id %x at %llx+%x %s%s\n",
 640                   rx_queue->queue, index,
 641                   (unsigned long long)rx_buf->dma_addr, len,
 642                   (checksummed ? " [SUMMED]" : ""),
 643                   (discard ? " [DISCARD]" : ""));
 644
 645         /* Discard packet, if instructed to do so */
 646         if (unlikely(discard)) {
 647                 if (unlikely(leak_packet))
 648                         rx_queue->channel->n_skbuff_leaks++;
 649                 else
 650                         /* We haven't called efx_unmap_rx_buffer yet,
 651                          * so fini the entire rx_buffer here */
 652                         efx_fini_rx_buffer(rx_queue, rx_buf);
 653                 return;
 654         }
 655
 656         /* Release card resources - assumes all RX buffers consumed in-order
 657          * per RX queue
 658          */
 659         efx_unmap_rx_buffer(efx, rx_buf);
 660
 661         /* Prefetch nice and early so data will (hopefully) be in cache by
 662          * the time we look at it.
 663          */
 664         prefetch(rx_buf->data);
 665
 666         /* Pipeline receives so that we give time for packet headers to be
 667          * prefetched into cache.
 668          */
 669         rx_buf->len = len;
 670         if (rx_queue->channel->rx_pkt)
 671                 __efx_rx_packet(rx_queue->channel,
 672                                 rx_queue->channel->rx_pkt,
 673                                 rx_queue->channel->rx_pkt_csummed);
 674         rx_queue->channel->rx_pkt = rx_buf;
 675         rx_queue->channel->rx_pkt_csummed = checksummed;
 676 }
 677
 678 /* Handle a received packet.  Second half: Touches packet payload. */
 679 void __efx_rx_packet(struct efx_channel *channel,
 680                      struct efx_rx_buffer *rx_buf, int checksummed)
 681 {
 682         struct efx_nic *efx = channel->efx;
 683         struct sk_buff *skb;
 684         int lro = efx->net_dev->features & NETIF_F_LRO;
 685
 686         if (rx_buf->skb) {
 687                 prefetch(skb_shinfo(rx_buf->skb));
 688
 689                 skb_put(rx_buf->skb, rx_buf->len);
 690
 691                 /* Move past the ethernet header. rx_buf->data still points
 692                  * at the ethernet header */
 693                 rx_buf->skb->protocol = eth_type_trans(rx_buf->skb,
 694                                                        efx->net_dev);
 695         }
 696
 697         /* Both our generic-LRO and SFC-SSR support skb and page based
 698          * allocation, but neither support switching from one to the
 699          * other on the fly. If we spot that the allocation mode has
 700          * changed, then flush the LRO state.
 701          */
 702         if (unlikely(channel->rx_alloc_pop_pages != (rx_buf->page != NULL))) {
 703                 efx_flush_lro(channel);
 704                 channel->rx_alloc_pop_pages = (rx_buf->page != NULL);
 705         }
 706         if (likely(checksummed && lro)) {
 707                 efx_rx_packet_lro(channel, rx_buf);
 708                 goto done;
 709         }
 710
 711         /* Form an skb if required */
 712         if (rx_buf->page) {
 713                 int hdr_len = min(rx_buf->len, EFX_SKB_HEADERS);
 714                 skb = efx_rx_mk_skb(rx_buf, efx, hdr_len);
 715                 if (unlikely(skb == NULL)) {
 716                         efx_free_rx_buffer(efx, rx_buf);
 717                         goto done;
 718                 }
 719         } else {
 720                 /* We now own the SKB */
 721                 skb = rx_buf->skb;
 722                 rx_buf->skb = NULL;
 723         }
 724
 725         EFX_BUG_ON_PARANOID(rx_buf->page);
 726         EFX_BUG_ON_PARANOID(rx_buf->skb);
 727         EFX_BUG_ON_PARANOID(!skb);
 728
 729         /* Set the SKB flags */
 730         if (unlikely(!checksummed || !efx->rx_checksum_enabled))
 731                 skb->ip_summed = CHECKSUM_NONE;
 732
 733         /* Pass the packet up */
 734         netif_receive_skb(skb);
 735
 736         /* Update allocation strategy method */
 737         channel->rx_alloc_level += RX_ALLOC_FACTOR_SKB;
 738
 739         /* fall-thru */
 740 done:
 741         efx->net_dev->last_rx = jiffies;
 742 }
 743
 744 void efx_rx_strategy(struct efx_channel *channel)
 745 {
 746         enum efx_rx_alloc_method method = rx_alloc_method;
 747
 748         /* Only makes sense to use page based allocation if LRO is enabled */
 749         if (!(channel->efx->net_dev->features & NETIF_F_LRO)) {
 750                 method = RX_ALLOC_METHOD_SKB;
 751         } else if (method == RX_ALLOC_METHOD_AUTO) {
 752                 /* Constrain the rx_alloc_level */
 753                 if (channel->rx_alloc_level < 0)
 754                         channel->rx_alloc_level = 0;
 755                 else if (channel->rx_alloc_level > RX_ALLOC_LEVEL_MAX)
 756                         channel->rx_alloc_level = RX_ALLOC_LEVEL_MAX;
 757
 758                 /* Decide on the allocation method */
 759                 method = ((channel->rx_alloc_level > RX_ALLOC_LEVEL_LRO) ?
 760                           RX_ALLOC_METHOD_PAGE : RX_ALLOC_METHOD_SKB);
 761         }
 762
 763         /* Push the option */
 764         channel->rx_alloc_push_pages = (method == RX_ALLOC_METHOD_PAGE);
 765 }
 766
 767 int efx_probe_rx_queue(struct efx_rx_queue *rx_queue)
 768 {
 769         struct efx_nic *efx = rx_queue->efx;
 770         unsigned int rxq_size;
 771         int rc;
 772
 773         EFX_LOG(efx, "creating RX queue %d\n", rx_queue->queue);
 774
 775         /* Allocate RX buffers */
 776         rxq_size = (efx->type->rxd_ring_mask + 1) * sizeof(*rx_queue->buffer);
 777         rx_queue->buffer = kzalloc(rxq_size, GFP_KERNEL);
 778         if (!rx_queue->buffer) {
 779                 rc = -ENOMEM;
 780                 goto fail1;
 781         }
 782
 783         rc = falcon_probe_rx(rx_queue);
 784         if (rc)
 785                 goto fail2;
 786
 787         return 0;
 788
 789  fail2:
 790         kfree(rx_queue->buffer);
 791         rx_queue->buffer = NULL;
 792  fail1:
 793         rx_queue->used = 0;
 794
 795         return rc;
 796 }
 797
 798 int efx_init_rx_queue(struct efx_rx_queue *rx_queue)
 799 {
 800         struct efx_nic *efx = rx_queue->efx;
 801         unsigned int max_fill, trigger, limit;
 802
 803         EFX_LOG(rx_queue->efx, "initialising RX queue %d\n", rx_queue->queue);
 804
 805         /* Initialise ptr fields */
 806         rx_queue->added_count = 0;
 807         rx_queue->notified_count = 0;
 808         rx_queue->removed_count = 0;
 809         rx_queue->min_fill = -1U;
 810         rx_queue->min_overfill = -1U;
 811
 812         /* Initialise limit fields */
 813         max_fill = efx->type->rxd_ring_mask + 1 - EFX_RXD_HEAD_ROOM;
 814         trigger = max_fill * min(rx_refill_threshold, 100U) / 100U;
 815         limit = max_fill * min(rx_refill_limit, 100U) / 100U;
 816
 817         rx_queue->max_fill = max_fill;
 818         rx_queue->fast_fill_trigger = trigger;
 819         rx_queue->fast_fill_limit = limit;
 820
 821         /* Set up RX descriptor ring */
 822         return falcon_init_rx(rx_queue);
 823 }
 824
 825 void efx_fini_rx_queue(struct efx_rx_queue *rx_queue)
 826 {
 827         int i;
 828         struct efx_rx_buffer *rx_buf;
 829
 830         EFX_LOG(rx_queue->efx, "shutting down RX queue %d\n", rx_queue->queue);
 831
 832         falcon_fini_rx(rx_queue);
 833
 834         /* Release RX buffers NB start at index 0 not current HW ptr */
 835         if (rx_queue->buffer) {
 836                 for (i = 0; i <= rx_queue->efx->type->rxd_ring_mask; i++) {
 837                         rx_buf = efx_rx_buffer(rx_queue, i);
 838                         efx_fini_rx_buffer(rx_queue, rx_buf);
 839                 }
 840         }
 841
 842         /* For a page that is part-way through splitting into RX buffers */
 843         if (rx_queue->buf_page != NULL) {
 844                 pci_unmap_page(rx_queue->efx->pci_dev, rx_queue->buf_dma_addr,
 845                                RX_PAGE_SIZE(rx_queue->efx), PCI_DMA_FROMDEVICE);
 846                 __free_pages(rx_queue->buf_page,
 847                              rx_queue->efx->rx_buffer_order);
 848                 rx_queue->buf_page = NULL;
 849         }
 850 }
 851
 852 void efx_remove_rx_queue(struct efx_rx_queue *rx_queue)
 853 {
 854         EFX_LOG(rx_queue->efx, "destroying RX queue %d\n", rx_queue->queue);
 855
 856         falcon_remove_rx(rx_queue);
 857
 858         kfree(rx_queue->buffer);
 859         rx_queue->buffer = NULL;
 860         rx_queue->used = 0;
 861 }
 862
 863 void efx_flush_lro(struct efx_channel *channel)
 864 {
 865         lro_flush_all(&channel->lro_mgr);
 866 }
 867
 868
 /* Module parameters.  rx_alloc_method is registered with mode 0644 and
  * rx_refill_threshold with mode 0444.  Note that rx_refill_limit is not
  * registered as a module parameter here.
  */
 module_param(rx_alloc_method, int, 0644);
 MODULE_PARM_DESC(rx_alloc_method, "Allocation method used for RX buffers");

 module_param(rx_refill_threshold, uint, 0444);
 MODULE_PARM_DESC(rx_refill_threshold,
                  "RX descriptor ring fast/slow fill threshold (%)");
 875