1/*  $OpenBSD: nvme.c,v 1.126 2026/01/14 01:07:57 jmatthew Exp $ */
2
3/*
4 * Copyright (c) 2014 David Gwynne <dlg@openbsd.org>
5 *
6 * Permission to use, copy, modify, and distribute this software for any
7 * purpose with or without fee is hereby granted, provided that the above
8 * copyright notice and this permission notice appear in all copies.
9 *
10 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17 */
18
19#include "bio.h"
20
21#include <sys/param.h>
22#include <sys/ioctl.h>
23#include <sys/systm.h>
24#include <sys/buf.h>
25#include <sys/kernel.h>
26#include <sys/malloc.h>
27#include <sys/device.h>
28#include <sys/queue.h>
29#include <sys/mutex.h>
30#include <sys/pool.h>
31#include <sys/disk.h>
32
33#include <sys/atomic.h>
34
35#include <machine/bus.h>
36
37#include <scsi/scsi_all.h>
38#include <scsi/scsi_disk.h>
39#include <scsi/scsiconf.h>
40#include <scsi/sdvar.h>
41
42#include <dev/biovar.h>
43#include <dev/ic/nvmereg.h>
44#include <dev/ic/nvmevar.h>
45#include <dev/ic/nvmeio.h>
46
/*
 * Autoconf driver glue: device instance list (managed by autoconf),
 * driver name, and device class.
 */
struct cfdriver nvme_cd = {
    NULL,
    "nvme",
    DV_DULL
};
52
53int nvme_ready(struct nvme_softc *, u_int32_t);
54int nvme_enable(struct nvme_softc *);
55int nvme_disable(struct nvme_softc *);
56int nvme_shutdown(struct nvme_softc *);
57int nvme_resume(struct nvme_softc *);
58
59void    nvme_dumpregs(struct nvme_softc *);
60int nvme_identify(struct nvme_softc *, u_int);
61void    nvme_fill_identify(struct nvme_softc *, struct nvme_ccb *, void *);
62
63#ifndef SMALL_KERNEL
64void    nvme_refresh_sensors(void *);
65#endif
66
67int nvme_ccbs_alloc(struct nvme_softc *, u_int);
68void    nvme_ccbs_free(struct nvme_softc *, u_int);
69
70void *  nvme_ccb_get(void *);
71void    nvme_ccb_put(void *, void *);
72
73int nvme_poll(struct nvme_softc *, struct nvme_queue *, struct nvme_ccb *,
74        void (*)(struct nvme_softc *, struct nvme_ccb *, void *), u_int32_t);
75void    nvme_poll_fill(struct nvme_softc *, struct nvme_ccb *, void *);
76void    nvme_poll_done(struct nvme_softc *, struct nvme_ccb *);
77void    nvme_sqe_fill(struct nvme_softc *, struct nvme_ccb *, void *);
78void    nvme_empty_done(struct nvme_softc *, struct nvme_ccb *);
79
80struct nvme_queue *
81    nvme_q_alloc(struct nvme_softc *, u_int16_t, u_int, u_int);
82int nvme_q_create(struct nvme_softc *, struct nvme_queue *);
83int nvme_q_reset(struct nvme_softc *, struct nvme_queue *);
84int nvme_q_delete(struct nvme_softc *, struct nvme_queue *);
85void    nvme_q_submit(struct nvme_softc *,
86        struct nvme_queue *, struct nvme_ccb *,
87        void (*)(struct nvme_softc *, struct nvme_ccb *, void *));
88int nvme_q_complete(struct nvme_softc *, struct nvme_queue *);
89void    nvme_q_free(struct nvme_softc *, struct nvme_queue *);
90
91void    nvme_scsi_cmd(struct scsi_xfer *);
92void    nvme_minphys(struct buf *, struct scsi_link *);
93int nvme_scsi_probe(struct scsi_link *);
94void    nvme_scsi_free(struct scsi_link *);
95uint64_t nvme_scsi_size(const struct nvm_identify_namespace *);
96int nvme_scsi_ioctl(struct scsi_link *, u_long, caddr_t, int);
97int nvme_passthrough_cmd(struct nvme_softc *, struct nvme_pt_cmd *,
98    int, int);
99
100#ifdef HIBERNATE
101#include <uvm/uvm_extern.h>
102#include <sys/hibernate.h>
103#include <sys/disklabel.h>
104
105int nvme_hibernate_io(dev_t, daddr_t, vaddr_t, size_t, int, void *);
106#endif
107
108#if NBIO > 0
109void    nvme_bio_status(struct bio_status *, const char *, ...);
110
111const char *nvme_bioctl_sdname(const struct nvme_softc *, int);
112
113int nvme_bioctl(struct device *, u_long, caddr_t);
114int nvme_bioctl_inq(struct nvme_softc *, struct bioc_inq *);
115int nvme_bioctl_vol(struct nvme_softc *, struct bioc_vol *);
116int nvme_bioctl_disk(struct nvme_softc *, struct bioc_disk *);
117#endif  /* NBIO > 0 */
118
/* SCSI midlayer entry points for the scsibus attached in nvme_attach(). */
const struct scsi_adapter nvme_switch = {
    nvme_scsi_cmd, nvme_minphys, nvme_scsi_probe, nvme_scsi_free,
    nvme_scsi_ioctl
};
123
124void    nvme_scsi_io(struct scsi_xfer *, int);
125void    nvme_scsi_io_fill(struct nvme_softc *, struct nvme_ccb *, void *);
126void    nvme_scsi_io_done(struct nvme_softc *, struct nvme_ccb *);
127
128void    nvme_scsi_sync(struct scsi_xfer *);
129void    nvme_scsi_sync_fill(struct nvme_softc *, struct nvme_ccb *, void *);
130void    nvme_scsi_sync_done(struct nvme_softc *, struct nvme_ccb *);
131
132void    nvme_scsi_inq(struct scsi_xfer *);
133void    nvme_scsi_inquiry(struct scsi_xfer *);
134void    nvme_scsi_capacity16(struct scsi_xfer *);
135void    nvme_scsi_capacity(struct scsi_xfer *);
136
137uint32_t    nvme_op_sq_enter(struct nvme_softc *,
138            struct nvme_queue *, struct nvme_ccb *);
139void        nvme_op_sq_leave(struct nvme_softc *,
140            struct nvme_queue *, struct nvme_ccb *);
141uint32_t    nvme_op_sq_enter_locked(struct nvme_softc *,
142            struct nvme_queue *, struct nvme_ccb *);
143void        nvme_op_sq_leave_locked(struct nvme_softc *,
144            struct nvme_queue *, struct nvme_ccb *);
145
146void        nvme_op_cq_done(struct nvme_softc *,
147            struct nvme_queue *, struct nvme_ccb *);
148
/*
 * Default queue access operations, installed by nvme_attach() when the
 * bus attachment did not supply its own sc_ops.
 */
static const struct nvme_ops nvme_ops = {
    .op_sq_enter        = nvme_op_sq_enter,
    .op_sq_leave        = nvme_op_sq_leave,
    .op_sq_enter_locked = nvme_op_sq_enter_locked,
    .op_sq_leave_locked = nvme_op_sq_leave_locked,

    .op_cq_done     = nvme_op_cq_done,
};
157
158#define NVME_TIMO_QOP           5000    /* ms to create/delete queue */
159#define NVME_TIMO_PT            5000    /* ms to complete passthrough */
160#define NVME_TIMO_IDENT         10000   /* ms to probe/identify */
161#define NVME_TIMO_LOG_PAGE      5000    /* ms to read log pages */
162#define NVME_TIMO_DELAYNS       10  /* ns to delay() in poll loop */
163
164/*
165 * Some controllers, at least Apple NVMe, always require split
166 * transfers, so don't use bus_space_{read,write}_8() on LP64.
167 */
168u_int64_t
169nvme_read8(struct nvme_softc *sc, bus_size_t r)
170{
171    u_int64_t v;
172
173    v = (u_int64_t)nvme_read4(sc, r) |
174        (u_int64_t)nvme_read4(sc, r + 4) << 32;
175
176    return (v);
177}
178
179void
180nvme_write8(struct nvme_softc *sc, bus_size_t r, u_int64_t v)
181{
182    nvme_write4(sc, r, v);
183    nvme_write4(sc, r + 4, v >> 32);
184}
185
186void
187nvme_dumpregs(struct nvme_softc *sc)
188{
189    u_int64_t r8;
190    u_int32_t r4;
191
192    r8 = nvme_read8(sc, NVME_CAP);
193    printf("%s: cap  0x%016llx\n", DEVNAME(sc), nvme_read8(sc, NVME_CAP));
194    printf("%s:  mpsmax %u (%u)\n", DEVNAME(sc),
195        (u_int)NVME_CAP_MPSMAX(r8), (1 << NVME_CAP_MPSMAX(r8)));
196    printf("%s:  mpsmin %u (%u)\n", DEVNAME(sc),
197        (u_int)NVME_CAP_MPSMIN(r8), (1 << NVME_CAP_MPSMIN(r8)));
198    printf("%s:  css %llu\n", DEVNAME(sc), NVME_CAP_CSS(r8));
199    printf("%s:  nssrs %llu\n", DEVNAME(sc), NVME_CAP_NSSRS(r8));
200    printf("%s:  dstrd %u\n", DEVNAME(sc), NVME_CAP_DSTRD(r8));
201    printf("%s:  to %llu msec\n", DEVNAME(sc), NVME_CAP_TO(r8));
202    printf("%s:  ams %llu\n", DEVNAME(sc), NVME_CAP_AMS(r8));
203    printf("%s:  cqr %llu\n", DEVNAME(sc), NVME_CAP_CQR(r8));
204    printf("%s:  mqes %llu\n", DEVNAME(sc), NVME_CAP_MQES(r8));
205
206    printf("%s: vs   0x%04x\n", DEVNAME(sc), nvme_read4(sc, NVME_VS));
207
208    r4 = nvme_read4(sc, NVME_CC);
209    printf("%s: cc   0x%04x\n", DEVNAME(sc), r4);
210    printf("%s:  iocqes %u\n", DEVNAME(sc), NVME_CC_IOCQES_R(r4));
211    printf("%s:  iosqes %u\n", DEVNAME(sc), NVME_CC_IOSQES_R(r4));
212    printf("%s:  shn %u\n", DEVNAME(sc), NVME_CC_SHN_R(r4));
213    printf("%s:  ams %u\n", DEVNAME(sc), NVME_CC_AMS_R(r4));
214    printf("%s:  mps %u\n", DEVNAME(sc), NVME_CC_MPS_R(r4));
215    printf("%s:  css %u\n", DEVNAME(sc), NVME_CC_CSS_R(r4));
216    printf("%s:  en %u\n", DEVNAME(sc), ISSET(r4, NVME_CC_EN));
217
218    printf("%s: csts 0x%08x\n", DEVNAME(sc), nvme_read4(sc, NVME_CSTS));
219    printf("%s: aqa  0x%08x\n", DEVNAME(sc), nvme_read4(sc, NVME_AQA));
220    printf("%s: asq  0x%016llx\n", DEVNAME(sc), nvme_read8(sc, NVME_ASQ));
221    printf("%s: acq  0x%016llx\n", DEVNAME(sc), nvme_read8(sc, NVME_ACQ));
222}
223
224int
225nvme_ready(struct nvme_softc *sc, u_int32_t rdy)
226{
227    u_int i = 0;
228
229    while ((nvme_read4(sc, NVME_CSTS) & NVME_CSTS_RDY) != rdy) {
230        if (i++ > sc->sc_rdy_to)
231            return (1);
232
233        delay(1000);
234        nvme_barrier(sc, NVME_CSTS, 4, BUS_SPACE_BARRIER_READ);
235    }
236
237    return (0);
238}
239
/*
 * Enable the controller: program the admin queue registers, configure
 * CC, set CC.EN and wait for CSTS.RDY.  Returns 0 on success, 1 if the
 * controller did not become ready in time.  The register write ordering
 * and barriers are deliberate; do not reorder.
 */
int
nvme_enable(struct nvme_softc *sc)
{
    u_int32_t cc;

    cc = nvme_read4(sc, NVME_CC);
    /* Already enabled: just wait for ready. */
    if (ISSET(cc, NVME_CC_EN))
        return (nvme_ready(sc, NVME_CSTS_RDY));

    /* Give the bus attachment a chance to do controller-specific setup. */
    if (sc->sc_ops->op_enable != NULL)
        sc->sc_ops->op_enable(sc);

    /* Admin submission/completion queue sizes. */
    nvme_write4(sc, NVME_AQA, NVME_AQA_ACQS(sc->sc_admin_q->q_entries) |
        NVME_AQA_ASQS(sc->sc_admin_q->q_entries));
    nvme_barrier(sc, 0, sc->sc_ios, BUS_SPACE_BARRIER_WRITE);

    /* Admin queue base addresses (DMA). */
    nvme_write8(sc, NVME_ASQ, NVME_DMA_DVA(sc->sc_admin_q->q_sq_dmamem));
    nvme_barrier(sc, 0, sc->sc_ios, BUS_SPACE_BARRIER_WRITE);
    nvme_write8(sc, NVME_ACQ, NVME_DMA_DVA(sc->sc_admin_q->q_cq_dmamem));
    nvme_barrier(sc, 0, sc->sc_ios, BUS_SPACE_BARRIER_WRITE);

    /* Rebuild CC from scratch, then flip EN on in the same write. */
    CLR(cc, NVME_CC_IOCQES_MASK | NVME_CC_IOSQES_MASK | NVME_CC_SHN_MASK |
        NVME_CC_AMS_MASK | NVME_CC_MPS_MASK | NVME_CC_CSS_MASK);
    SET(cc, NVME_CC_IOSQES(6)); /* Submission queue size == 2**6 (64) */
    SET(cc, NVME_CC_IOCQES(4)); /* Completion queue size == 2**4 (16) */
    SET(cc, NVME_CC_SHN(NVME_CC_SHN_NONE));
    SET(cc, NVME_CC_CSS(NVME_CC_CSS_NVM));
    SET(cc, NVME_CC_AMS(NVME_CC_AMS_RR));
    SET(cc, NVME_CC_MPS(ffs(sc->sc_mps) - 1));
    SET(cc, NVME_CC_EN);

    nvme_write4(sc, NVME_CC, cc);
    nvme_barrier(sc, 0, sc->sc_ios,
        BUS_SPACE_BARRIER_READ | BUS_SPACE_BARRIER_WRITE);

    return (nvme_ready(sc, NVME_CSTS_RDY));
}
277
/*
 * Disable the controller: if it is enabled (and not in a fatal state),
 * wait for it to become ready first, then clear CC.EN and wait for
 * CSTS.RDY to drop.  Returns 0 on success, 1 on timeout.
 */
int
nvme_disable(struct nvme_softc *sc)
{
    u_int32_t cc, csts;

    cc = nvme_read4(sc, NVME_CC);
    if (ISSET(cc, NVME_CC_EN)) {
        /*
         * Wait for ready before disabling, unless the controller
         * reports a fatal status (CFS), in which case it will never
         * become ready and we disable it regardless.
         */
        csts = nvme_read4(sc, NVME_CSTS);
        if (!ISSET(csts, NVME_CSTS_CFS) &&
            nvme_ready(sc, NVME_CSTS_RDY) != 0)
            return (1);
    }

    CLR(cc, NVME_CC_EN);

    nvme_write4(sc, NVME_CC, cc);
    nvme_barrier(sc, 0, sc->sc_ios,
        BUS_SPACE_BARRIER_READ | BUS_SPACE_BARRIER_WRITE);

    /* Wait for RDY to deassert. */
    return (nvme_ready(sc, 0));
}
299
/*
 * Common attach path, called by the bus front-ends once register access
 * is mapped.  Resets the controller, identifies it, sets up the admin
 * and I/O queues and ccbs, then attaches a scsibus with one target per
 * namespace.  Returns 0 on success, 1 on failure (resources released
 * via the cascading error labels at the bottom).
 */
int
nvme_attach(struct nvme_softc *sc)
{
    struct scsibus_attach_args saa;
    u_int64_t cap;
    u_int32_t reg;
    u_int nccbs = 0;

    mtx_init(&sc->sc_ccb_mtx, IPL_BIO);
    rw_init(&sc->sc_lock, "nvme_lock");
    SIMPLEQ_INIT(&sc->sc_ccb_list);
    scsi_iopool_init(&sc->sc_iopool, sc, nvme_ccb_get, nvme_ccb_put);
    /* Fall back to the default queue ops if the bus glue set none. */
    if (sc->sc_ops == NULL)
        sc->sc_ops = &nvme_ops;
    if (sc->sc_openings == 0)
        sc->sc_openings = 64;

    /* An all-ones VS register means the BAR mapping is bogus. */
    reg = nvme_read4(sc, NVME_VS);
    if (reg == 0xffffffff) {
        printf("invalid mapping\n");
        return (1);
    }

    printf("NVMe %d.%d\n", NVME_VS_MJR(reg), NVME_VS_MNR(reg));

    cap = nvme_read8(sc, NVME_CAP);
    sc->sc_dstrd = NVME_CAP_DSTRD(cap);
    /* We cannot talk to a controller whose minimum page > CPU page. */
    if (NVME_CAP_MPSMIN(cap) > PAGE_SHIFT) {
        printf("%s: NVMe minimum page size %u "
            "is greater than CPU page size %u\n", DEVNAME(sc),
            1 << NVME_CAP_MPSMIN(cap), 1 << PAGE_SHIFT);
        return (1);
    }
    /* Use the largest page size both the CPU and controller support. */
    if (NVME_CAP_MPSMAX(cap) < PAGE_SHIFT)
        sc->sc_mps = 1 << NVME_CAP_MPSMAX(cap);
    else
        sc->sc_mps = 1 << PAGE_SHIFT;

    sc->sc_rdy_to = NVME_CAP_TO(cap);
    /* Conservative transfer limits until IDENTIFY tells us the truth. */
    sc->sc_mdts = MAXPHYS;
    sc->sc_max_prpl = sc->sc_mdts / sc->sc_mps;

    if (nvme_disable(sc) != 0) {
        printf("%s: unable to disable controller\n", DEVNAME(sc));
        return (1);
    }

    sc->sc_admin_q = nvme_q_alloc(sc, NVME_ADMIN_Q, 128, sc->sc_dstrd);
    if (sc->sc_admin_q == NULL) {
        printf("%s: unable to allocate admin queue\n", DEVNAME(sc));
        return (1);
    }

    /* A small initial ccb pool, enough to run IDENTIFY. */
    if (nvme_ccbs_alloc(sc, 16) != 0) {
        printf("%s: unable to allocate initial ccbs\n", DEVNAME(sc));
        goto free_admin_q;
    }
    nccbs = 16;

    if (nvme_enable(sc) != 0) {
        printf("%s: unable to enable controller\n", DEVNAME(sc));
        goto free_ccbs;
    }

    if (nvme_identify(sc, NVME_CAP_MPSMIN(cap)) != 0) {
        printf("%s: unable to identify controller\n", DEVNAME(sc));
        goto disable;
    }

    /* We now know the real values of sc_mdts and sc_max_prpl. */
    nvme_ccbs_free(sc, nccbs);
    if (nvme_ccbs_alloc(sc, 64) != 0) {
        printf("%s: unable to allocate ccbs\n", DEVNAME(sc));
        goto free_admin_q;
    }
    nccbs = 64;

    sc->sc_q = nvme_q_alloc(sc, NVME_IO_Q, 128, sc->sc_dstrd);
    if (sc->sc_q == NULL) {
        printf("%s: unable to allocate io q\n", DEVNAME(sc));
        goto disable;
    }

    if (nvme_q_create(sc, sc->sc_q) != 0) {
        printf("%s: unable to create io q\n", DEVNAME(sc));
        goto free_q;
    }

#ifdef HIBERNATE
    /* Small dedicated queue used by nvme_hibernate_io(). */
    sc->sc_hib_q = nvme_q_alloc(sc, NVME_HIB_Q, 4, sc->sc_dstrd);
    if (sc->sc_hib_q == NULL) {
        printf("%s: unable to allocate hibernate io queue\n", DEVNAME(sc));
        goto free_q;
    }
#endif

    /* Unmask controller interrupts. */
    nvme_write4(sc, NVME_INTMC, 1);

    /* One slot per namespace id; nsid 0 is invalid, hence the +1. */
    sc->sc_namespaces = mallocarray(sc->sc_nn + 1,
        sizeof(*sc->sc_namespaces), M_DEVBUF, M_WAITOK|M_ZERO);

    saa.saa_adapter = &nvme_switch;
    saa.saa_adapter_softc = sc;
    saa.saa_adapter_buswidth = sc->sc_nn + 1;
    saa.saa_luns = 1;
    saa.saa_adapter_target = 0;
    saa.saa_openings = sc->sc_openings;
    saa.saa_pool = &sc->sc_iopool;
    saa.saa_quirks = saa.saa_flags = 0;
    saa.saa_wwpn = saa.saa_wwnn = 0;

    strlcpy(sc->sc_sensordev.xname, DEVNAME(sc), sizeof(sc->sc_sensordev.xname));

#ifndef SMALL_KERNEL
    /* Temperature, endurance and spare sensors fed from SMART data. */
    sc->sc_temp_sensor.type = SENSOR_TEMP;
    sc->sc_temp_sensor.status = SENSOR_S_UNKNOWN;
    sensor_attach(&sc->sc_sensordev, &sc->sc_temp_sensor);

    sc->sc_usage_sensor.type = SENSOR_PERCENT;
    sc->sc_usage_sensor.status = SENSOR_S_UNKNOWN;
    strlcpy(sc->sc_usage_sensor.desc, "endurance used",
        sizeof(sc->sc_usage_sensor.desc));
    sensor_attach(&sc->sc_sensordev, &sc->sc_usage_sensor);

    sc->sc_spare_sensor.type = SENSOR_PERCENT;
    sc->sc_spare_sensor.status = SENSOR_S_UNKNOWN;
    strlcpy(sc->sc_spare_sensor.desc, "available spare",
        sizeof(sc->sc_spare_sensor.desc));
    sensor_attach(&sc->sc_sensordev, &sc->sc_spare_sensor);

    /* Refresh the sensors once a minute. */
    if (sensor_task_register(sc, nvme_refresh_sensors, 60) == NULL)
        goto free_q;

    sensordev_install(&sc->sc_sensordev);
#endif

    sc->sc_scsibus = (struct scsibus_softc *)config_found(&sc->sc_dev,
        &saa, scsiprint);
#if NBIO > 0
    if (bio_register(&sc->sc_dev, nvme_bioctl) != 0)
        printf("%s: unable to register bioctl\n", DEVNAME(sc));
#endif  /* NBIO > 0 */

    return (0);

/* Error labels cascade: each falls through to release earlier state. */
free_q:
    nvme_q_free(sc, sc->sc_q);
disable:
    nvme_disable(sc);
free_ccbs:
    nvme_ccbs_free(sc, nccbs);
free_admin_q:
    nvme_q_free(sc, sc->sc_admin_q);

    return (1);
}
456
/*
 * Resume after suspend: reset and re-enable the controller, rebuild the
 * admin queue state and recreate the I/O queue.  Returns 0 on success,
 * 1 on failure.
 */
int
nvme_resume(struct nvme_softc *sc)
{
    if (nvme_disable(sc) != 0) {
        printf("%s: unable to disable controller\n", DEVNAME(sc));
        return (1);
    }

    /* Reset the software view of the admin queue to match the hardware. */
    if (nvme_q_reset(sc, sc->sc_admin_q) != 0) {
        printf("%s: unable to reset admin queue\n", DEVNAME(sc));
        return (1);
    }

    if (nvme_enable(sc) != 0) {
        printf("%s: unable to enable controller\n", DEVNAME(sc));
        return (1);
    }

    /* The I/O queue must be reallocated and recreated from scratch. */
    sc->sc_q = nvme_q_alloc(sc, NVME_IO_Q, 128, sc->sc_dstrd);
    if (sc->sc_q == NULL) {
        printf("%s: unable to allocate io q\n", DEVNAME(sc));
        goto disable;
    }

    if (nvme_q_create(sc, sc->sc_q) != 0) {
        printf("%s: unable to create io q\n", DEVNAME(sc));
        goto free_q;
    }

    /* Unmask controller interrupts. */
    nvme_write4(sc, NVME_INTMC, 1);

    return (0);

free_q:
    nvme_q_free(sc, sc->sc_q);
disable:
    nvme_disable(sc);

    return (1);
}
497
498int
499nvme_scsi_probe(struct scsi_link *link)
500{
501    struct nvme_softc *sc = link->bus->sb_adapter_softc;
502    struct nvme_sqe sqe;
503    struct nvm_identify_namespace *identify;
504    struct nvme_dmamem *mem;
505    struct nvme_ccb *ccb;
506    int rv;
507
508    ccb = scsi_io_get(&sc->sc_iopool, 0);
509    KASSERT(ccb != NULL);
510
511    mem = nvme_dmamem_alloc(sc, sizeof(*identify));
512    if (mem == NULL)
513        return (ENOMEM);
514
515    memset(&sqe, 0, sizeof(sqe));
516    sqe.opcode = NVM_ADMIN_IDENTIFY;
517    htolem32(&sqe.nsid, link->target);
518    htolem64(&sqe.entry.prp[0], NVME_DMA_DVA(mem));
519    htolem32(&sqe.cdw10, 0);
520
521    ccb->ccb_done = nvme_empty_done;
522    ccb->ccb_cookie = &sqe;
523
524    nvme_dmamem_sync(sc, mem, BUS_DMASYNC_PREREAD);
525    rv = nvme_poll(sc, sc->sc_admin_q, ccb, nvme_sqe_fill, NVME_TIMO_IDENT);
526    nvme_dmamem_sync(sc, mem, BUS_DMASYNC_POSTREAD);
527
528    scsi_io_put(&sc->sc_iopool, ccb);
529
530    identify = NVME_DMA_KVA(mem);
531    if (rv == 0) {
532        if (nvme_scsi_size(identify) > 0) {
533            /* Commit namespace if it has a size greater than zero. */
534            identify = malloc(sizeof(*identify), M_DEVBUF, M_WAITOK);
535            memcpy(identify, NVME_DMA_KVA(mem), sizeof(*identify));
536            sc->sc_namespaces[link->target].ident = identify;
537        } else {
538            /* Don't attach a namespace if its size is zero. */
539            rv = ENXIO;
540        }
541    }
542
543    nvme_dmamem_free(sc, mem);
544
545    return (rv);
546}
547
/*
 * Orderly shutdown: delete the I/O queue, request a normal shutdown via
 * CC.SHN and poll CSTS.SHST for completion (up to ~4 seconds).  Falls
 * back to simply disabling the controller if anything fails.  Always
 * returns 0.
 */
int
nvme_shutdown(struct nvme_softc *sc)
{
    u_int32_t cc, csts;
    int i;

    /* Mask controller interrupts before tearing down queues. */
    nvme_write4(sc, NVME_INTMC, 0);

    if (nvme_q_delete(sc, sc->sc_q) != 0) {
        printf("%s: unable to delete q, disabling\n", DEVNAME(sc));
        goto disable;
    }

    /* Request a normal (flushing) shutdown. */
    cc = nvme_read4(sc, NVME_CC);
    CLR(cc, NVME_CC_SHN_MASK);
    SET(cc, NVME_CC_SHN(NVME_CC_SHN_NORMAL));
    nvme_write4(sc, NVME_CC, cc);

    /* Poll for shutdown-complete, 1ms per iteration. */
    for (i = 0; i < 4000; i++) {
        nvme_barrier(sc, 0, sc->sc_ios,
            BUS_SPACE_BARRIER_READ | BUS_SPACE_BARRIER_WRITE);
        csts = nvme_read4(sc, NVME_CSTS);
        if ((csts & NVME_CSTS_SHST_MASK) == NVME_CSTS_SHST_DONE)
            return (0);

        delay(1000);
    }

    printf("%s: unable to shutdown, disabling\n", DEVNAME(sc));

disable:
    nvme_disable(sc);
    return (0);
}
582
583int
584nvme_activate(struct nvme_softc *sc, int act)
585{
586    int rv;
587
588    switch (act) {
589    case DVACT_POWERDOWN:
590        rv = config_activate_children(&sc->sc_dev, act);
591        nvme_shutdown(sc);
592        break;
593    case DVACT_RESUME:
594        rv = nvme_resume(sc);
595        if (rv == 0)
596            rv = config_activate_children(&sc->sc_dev, act);
597        break;
598    default:
599        rv = config_activate_children(&sc->sc_dev, act);
600        break;
601    }
602
603    return (rv);
604}
605
606void
607nvme_scsi_cmd(struct scsi_xfer *xs)
608{
609    switch (xs->cmd.opcode) {
610    case READ_COMMAND:
611    case READ_10:
612    case READ_12:
613    case READ_16:
614        nvme_scsi_io(xs, SCSI_DATA_IN);
615        return;
616    case WRITE_COMMAND:
617    case WRITE_10:
618    case WRITE_12:
619    case WRITE_16:
620        nvme_scsi_io(xs, SCSI_DATA_OUT);
621        return;
622
623    case SYNCHRONIZE_CACHE:
624        nvme_scsi_sync(xs);
625        return;
626
627    case INQUIRY:
628        nvme_scsi_inq(xs);
629        return;
630    case READ_CAPACITY_16:
631        nvme_scsi_capacity16(xs);
632        return;
633    case READ_CAPACITY:
634        nvme_scsi_capacity(xs);
635        return;
636
637    case TEST_UNIT_READY:
638    case PREVENT_ALLOW:
639    case START_STOP:
640        xs->error = XS_NOERROR;
641        scsi_done(xs);
642        return;
643
644    default:
645        break;
646    }
647
648    xs->error = XS_DRIVER_STUFFUP;
649    scsi_done(xs);
650}
651
652void
653nvme_minphys(struct buf *bp, struct scsi_link *link)
654{
655    struct nvme_softc *sc = link->bus->sb_adapter_softc;
656
657    if (bp->b_bcount > sc->sc_mdts)
658        bp->b_bcount = sc->sc_mdts;
659}
660
/*
 * Start a READ or WRITE: DMA-map the data buffer, pre-build the PRP
 * list when more than two segments are needed, then submit (or poll,
 * for SCSI_POLL) the command on the I/O queue.  "dir" is the expected
 * SCSI_DATA_IN/SCSI_DATA_OUT flag and must match the xfer.
 */
void
nvme_scsi_io(struct scsi_xfer *xs, int dir)
{
    struct scsi_link *link = xs->sc_link;
    struct nvme_softc *sc = link->bus->sb_adapter_softc;
    struct nvme_ccb *ccb = xs->io;
    bus_dmamap_t dmap = ccb->ccb_dmamap;
    int i;

    /* The opcode's direction must match what the midlayer set up. */
    if ((xs->flags & (SCSI_DATA_IN|SCSI_DATA_OUT)) != dir)
        goto stuffup;

    ccb->ccb_done = nvme_scsi_io_done;
    ccb->ccb_cookie = xs;

    if (bus_dmamap_load(sc->sc_dmat, dmap,
        xs->data, xs->datalen, NULL, ISSET(xs->flags, SCSI_NOSLEEP) ?
        BUS_DMA_NOWAIT : BUS_DMA_WAITOK) != 0)
        goto stuffup;

    bus_dmamap_sync(sc->sc_dmat, dmap, 0, dmap->dm_mapsize,
        ISSET(xs->flags, SCSI_DATA_IN) ?
        BUS_DMASYNC_PREREAD : BUS_DMASYNC_PREWRITE);

    /*
     * More than two segments cannot be described in the SQE itself;
     * write segments 2..n into this ccb's slot of the shared PRP list
     * and sync it so the controller sees it.  The first segment goes
     * in prp[0] and the list address in prp[1] (see nvme_scsi_io_fill).
     */
    if (dmap->dm_nsegs > 2) {
        for (i = 1; i < dmap->dm_nsegs; i++) {
            htolem64(&ccb->ccb_prpl[i - 1],
                dmap->dm_segs[i].ds_addr);
        }
        bus_dmamap_sync(sc->sc_dmat,
            NVME_DMA_MAP(sc->sc_ccb_prpls),
            ccb->ccb_prpl_off,
            sizeof(*ccb->ccb_prpl) * (dmap->dm_nsegs - 1),
            BUS_DMASYNC_PREWRITE);
    }

    if (ISSET(xs->flags, SCSI_POLL)) {
        nvme_poll(sc, sc->sc_q, ccb, nvme_scsi_io_fill, xs->timeout);
        return;
    }

    nvme_q_submit(sc, sc->sc_q, ccb, nvme_scsi_io_fill);
    return;

stuffup:
    xs->error = XS_DRIVER_STUFFUP;
    scsi_done(xs);
}
709
/*
 * Fill an I/O submission queue entry for the xfer attached to the ccb:
 * opcode, nsid, PRP entries (or PRP list address), starting LBA and
 * block count.  Called with "slot" pointing at the SQE to build.
 */
void
nvme_scsi_io_fill(struct nvme_softc *sc, struct nvme_ccb *ccb, void *slot)
{
    struct nvme_sqe_io *sqe = slot;
    struct scsi_xfer *xs = ccb->ccb_cookie;
    struct scsi_link *link = xs->sc_link;
    bus_dmamap_t dmap = ccb->ccb_dmamap;
    u_int64_t lba;
    u_int32_t blocks;

    /* Extract LBA and length from the CDB. */
    scsi_cmd_rw_decode(&xs->cmd, &lba, &blocks);

    sqe->opcode = ISSET(xs->flags, SCSI_DATA_IN) ?
        NVM_CMD_READ : NVM_CMD_WRITE;
    /* The scsibus target number doubles as the nsid. */
    htolem32(&sqe->nsid, link->target);

    htolem64(&sqe->entry.prp[0], dmap->dm_segs[0].ds_addr);
    switch (dmap->dm_nsegs) {
    case 1:
        break;
    case 2:
        /* Two segments fit directly in the SQE. */
        htolem64(&sqe->entry.prp[1], dmap->dm_segs[1].ds_addr);
        break;
    default:
        /* the prp list is already set up and synced */
        htolem64(&sqe->entry.prp[1], ccb->ccb_prpl_dva);
        break;
    }

    htolem64(&sqe->slba, lba);
    /* NVMe's NLB field is zero-based. */
    htolem16(&sqe->nlb, blocks - 1);
}
742
/*
 * I/O completion: undo the DMA syncs/mapping done by nvme_scsi_io(),
 * translate the NVMe status code into a SCSI result and finish the
 * xfer.
 */
void
nvme_scsi_io_done(struct nvme_softc *sc, struct nvme_ccb *ccb)
{
    struct scsi_xfer *xs = ccb->ccb_cookie;
    bus_dmamap_t dmap = ccb->ccb_dmamap;

    /* Mirror the PRP-list PREWRITE sync from nvme_scsi_io(). */
    if (dmap->dm_nsegs > 2) {
        bus_dmamap_sync(sc->sc_dmat,
            NVME_DMA_MAP(sc->sc_ccb_prpls),
            ccb->ccb_prpl_off,
            sizeof(*ccb->ccb_prpl) * (dmap->dm_nsegs - 1),
            BUS_DMASYNC_POSTWRITE);
    }

    bus_dmamap_sync(sc->sc_dmat, dmap, 0, dmap->dm_mapsize,
        ISSET(xs->flags, SCSI_DATA_IN) ?
        BUS_DMASYNC_POSTREAD : BUS_DMASYNC_POSTWRITE);

    bus_dmamap_unload(sc->sc_dmat, dmap);

    /* Any status other than SUCCESS is reported as a driver error. */
    xs->error = (NVME_CQE_SC(ccb->ccb_cqe_flags) ==
        NVME_CQE_SC_SUCCESS) ? XS_NOERROR : XS_DRIVER_STUFFUP;
    xs->status = SCSI_OK;
    xs->resid = 0;
    scsi_done(xs);
}
769
770void
771nvme_scsi_sync(struct scsi_xfer *xs)
772{
773    struct scsi_link *link = xs->sc_link;
774    struct nvme_softc *sc = link->bus->sb_adapter_softc;
775    struct nvme_ccb *ccb = xs->io;
776
777    ccb->ccb_done = nvme_scsi_sync_done;
778    ccb->ccb_cookie = xs;
779
780    if (ISSET(xs->flags, SCSI_POLL)) {
781        nvme_poll(sc, sc->sc_q, ccb, nvme_scsi_sync_fill, xs->timeout);
782        return;
783    }
784
785    nvme_q_submit(sc, sc->sc_q, ccb, nvme_scsi_sync_fill);
786}
787
788void
789nvme_scsi_sync_fill(struct nvme_softc *sc, struct nvme_ccb *ccb, void *slot)
790{
791    struct nvme_sqe *sqe = slot;
792    struct scsi_xfer *xs = ccb->ccb_cookie;
793    struct scsi_link *link = xs->sc_link;
794
795    sqe->opcode = NVM_CMD_FLUSH;
796    htolem32(&sqe->nsid, link->target);
797}
798
799void
800nvme_scsi_sync_done(struct nvme_softc *sc, struct nvme_ccb *ccb)
801{
802    struct scsi_xfer *xs = ccb->ccb_cookie;
803
804    xs->error = (NVME_CQE_SC(ccb->ccb_cqe_flags) ==
805        NVME_CQE_SC_SUCCESS) ? XS_NOERROR : XS_DRIVER_STUFFUP;
806    xs->status = SCSI_OK;
807    xs->resid = 0;
808    scsi_done(xs);
809}
810
811void
812nvme_scsi_inq(struct scsi_xfer *xs)
813{
814    struct scsi_inquiry *inq = (struct scsi_inquiry *)&xs->cmd;
815
816    if (!ISSET(inq->flags, SI_EVPD)) {
817        nvme_scsi_inquiry(xs);
818        return;
819    }
820
821    switch (inq->pagecode) {
822    default:
823        /* printf("%s: %d\n", __func__, inq->pagecode); */
824        break;
825    }
826
827    xs->error = XS_DRIVER_STUFFUP;
828    scsi_done(xs);
829}
830
831void
832nvme_scsi_inquiry(struct scsi_xfer *xs)
833{
834    struct scsi_inquiry_data inq;
835    struct scsi_link *link = xs->sc_link;
836    struct nvme_softc *sc = link->bus->sb_adapter_softc;
837    struct nvm_identify_namespace *ns;
838
839    ns = sc->sc_namespaces[link->target].ident;
840
841    memset(&inq, 0, sizeof(inq));
842
843    inq.device = T_DIRECT;
844    inq.version = SCSI_REV_SPC4;
845    inq.response_format = SID_SCSI2_RESPONSE;
846    inq.additional_length = SID_SCSI2_ALEN;
847    inq.flags |= SID_CmdQue;
848    memcpy(inq.vendor, "NVMe    ", sizeof(inq.vendor));
849    memcpy(inq.product, sc->sc_identify.mn, sizeof(inq.product));
850    memcpy(inq.revision, sc->sc_identify.fr, sizeof(inq.revision));
851
852    scsi_copy_internal_data(xs, &inq, sizeof(inq));
853
854    xs->error = XS_NOERROR;
855    scsi_done(xs);
856}
857
/*
 * READ CAPACITY(16): report the last LBA, the logical block size from
 * the namespace's active LBA format, and advertise thin-provisioning
 * support (TPE).
 */
void
nvme_scsi_capacity16(struct scsi_xfer *xs)
{
    struct scsi_read_cap_data_16 rcd;
    struct scsi_link *link = xs->sc_link;
    struct nvme_softc *sc = link->bus->sb_adapter_softc;
    struct nvm_identify_namespace *ns;
    struct nvm_namespace_format *f;
    u_int64_t addr;
    u_int16_t tpe = READ_CAP_16_TPE;

    ns = sc->sc_namespaces[link->target].ident;

    if (xs->cmdlen != sizeof(struct scsi_read_capacity_16)) {
        xs->error = XS_DRIVER_STUFFUP;
        scsi_done(xs);
        return;
    }

    /* Last addressable LBA, not the block count. */
    addr = nvme_scsi_size(ns) - 1;
    /* The active LBA format is selected by FLBAS. */
    f = &ns->lbaf[NVME_ID_NS_FLBAS(ns->flbas)];

    memset(&rcd, 0, sizeof(rcd));
    _lto8b(addr, rcd.addr);
    _lto4b(1 << f->lbads, rcd.length);  /* lbads is log2(block size) */
    _lto2b(tpe, rcd.lowest_aligned);

    memcpy(xs->data, &rcd, MIN(sizeof(rcd), xs->datalen));

    xs->error = XS_NOERROR;
    scsi_done(xs);
}
890
/*
 * READ CAPACITY(10): like capacity16 above but with the last LBA
 * saturated to 0xffffffff, which tells the initiator to use READ
 * CAPACITY(16) instead for large namespaces.
 */
void
nvme_scsi_capacity(struct scsi_xfer *xs)
{
    struct scsi_read_cap_data rcd;
    struct scsi_link *link = xs->sc_link;
    struct nvme_softc *sc = link->bus->sb_adapter_softc;
    struct nvm_identify_namespace *ns;
    struct nvm_namespace_format *f;
    u_int64_t addr;

    ns = sc->sc_namespaces[link->target].ident;

    if (xs->cmdlen != sizeof(struct scsi_read_capacity)) {
        xs->error = XS_DRIVER_STUFFUP;
        scsi_done(xs);
        return;
    }

    /* Last addressable LBA, clamped to what 32 bits can express. */
    addr = nvme_scsi_size(ns) - 1;
    if (addr > 0xffffffff)
        addr = 0xffffffff;

    f = &ns->lbaf[NVME_ID_NS_FLBAS(ns->flbas)];

    memset(&rcd, 0, sizeof(rcd));
    _lto4b(addr, rcd.addr);
    _lto4b(1 << f->lbads, rcd.length);  /* lbads is log2(block size) */

    memcpy(xs->data, &rcd, MIN(sizeof(rcd), xs->datalen));

    xs->error = XS_NOERROR;
    scsi_done(xs);
}
924
925void
926nvme_scsi_free(struct scsi_link *link)
927{
928    struct nvme_softc *sc = link->bus->sb_adapter_softc;
929    struct nvm_identify_namespace *identify;
930
931    identify = sc->sc_namespaces[link->target].ident;
932    sc->sc_namespaces[link->target].ident = NULL;
933
934    free(identify, M_DEVBUF, sizeof(*identify));
935}
936
937uint64_t
938nvme_scsi_size(const struct nvm_identify_namespace *ns)
939{
940    uint64_t        ncap, nsze;
941
942    ncap = lemtoh64(&ns->ncap); /* Max allowed allocation. */
943    nsze = lemtoh64(&ns->nsze);
944
945    if ((ns->nsfeat & NVME_ID_NS_NSFEAT_THIN_PROV) && ncap < nsze)
946        return ncap;
947    else
948        return nsze;
949}
950
/*
 * Execute an admin passthrough command (NVME_PASSTHROUGH_CMD ioctl) on
 * behalf of userland.  Only read-only admin opcodes are allowed.  Data,
 * if requested, is returned through a bounce DMA buffer; completion
 * status is copied out into a struct nvme_pt_status.  Returns 0 or an
 * errno.  Caller holds sc_lock.
 */
int
nvme_passthrough_cmd(struct nvme_softc *sc, struct nvme_pt_cmd *pt, int dv_unit,
    int nsid)
{
    struct nvme_pt_status        pt_status;
    struct nvme_sqe          sqe;
    struct nvme_dmamem      *mem = NULL;
    struct nvme_ccb         *ccb = NULL;
    int              flags;
    int              rv = 0;

    ccb = scsi_io_get(&sc->sc_iopool, 0);
    KASSERT(ccb != NULL);

    /* Build the SQE from the user-supplied command dwords. */
    memset(&sqe, 0, sizeof(sqe));
    sqe.opcode = pt->pt_opcode;
    htolem32(&sqe.nsid, pt->pt_nsid);
    htolem32(&sqe.cdw10, pt->pt_cdw10);
    htolem32(&sqe.cdw11, pt->pt_cdw11);
    htolem32(&sqe.cdw12, pt->pt_cdw12);
    htolem32(&sqe.cdw13, pt->pt_cdw13);
    htolem32(&sqe.cdw14, pt->pt_cdw14);
    htolem32(&sqe.cdw15, pt->pt_cdw15);

    ccb->ccb_done = nvme_empty_done;
    ccb->ccb_cookie = &sqe;

    /* Whitelist of opcodes userland may issue. */
    switch (pt->pt_opcode) {
    case NVM_ADMIN_IDENTIFY:
    case NVM_ADMIN_GET_LOG_PG:
    case NVM_ADMIN_SELFTEST:
        break;

    default:
        rv = ENOTTY;
        goto done;
    }

    if (pt->pt_databuflen > 0) {
        /* Bounce buffer the controller DMAs the response into. */
        mem = nvme_dmamem_alloc(sc, pt->pt_databuflen);
        if (mem == NULL) {
            rv = ENOMEM;
            goto done;
        }
        htolem64(&sqe.entry.prp[0], NVME_DMA_DVA(mem));
        nvme_dmamem_sync(sc, mem, BUS_DMASYNC_PREREAD);
    }

    flags = nvme_poll(sc, sc->sc_admin_q, ccb, nvme_sqe_fill, NVME_TIMO_PT);

    if (pt->pt_databuflen > 0) {
        nvme_dmamem_sync(sc, mem, BUS_DMASYNC_POSTREAD);
        /* Only copy data out if the command completed successfully. */
        if (flags == 0)
            rv = copyout(NVME_DMA_KVA(mem), pt->pt_databuf,
                pt->pt_databuflen);
    }

    /* Report completion status back to userland if requested. */
    if (rv == 0 && pt->pt_statuslen > 0) {
        pt_status.ps_dv_unit = dv_unit;
        pt_status.ps_nsid = nsid;
        pt_status.ps_flags = flags;
        pt_status.ps_cc = nvme_read4(sc, NVME_CC);
        pt_status.ps_csts = nvme_read4(sc, NVME_CSTS);
        rv = copyout(&pt_status, pt->pt_status, pt->pt_statuslen);
    }

 done:
    if (mem)
        nvme_dmamem_free(sc, mem);
    /* ccb is always non-NULL here (KASSERT above); guard is harmless. */
    if (ccb)
        nvme_ccb_put(sc, ccb);

    return rv;
}
1025
1026int
1027nvme_scsi_ioctl(struct scsi_link *link, u_long cmd, caddr_t addr, int flag)
1028{
1029    struct nvme_softc       *sc = link->bus->sb_adapter_softc;
1030    struct nvme_pt_cmd      *pt = (struct nvme_pt_cmd *)addr;
1031    int              rv;
1032
1033    switch (cmd) {
1034    case NVME_PASSTHROUGH_CMD:
1035        break;
1036    default:
1037        return ENOTTY;
1038    }
1039
1040    if ((pt->pt_cdw10 & 0xff) == 0)
1041        pt->pt_nsid = link->target;
1042
1043    rw_enter_write(&sc->sc_lock);
1044    rv = nvme_passthrough_cmd(sc, pt, sc->sc_dev.dv_unit, link->target);
1045    rw_exit_write(&sc->sc_lock);
1046    if (rv)
1047        goto done;
1048
1049 done:
1050    return rv;
1051}
1052
1053uint32_t
1054nvme_op_sq_enter(struct nvme_softc *sc,
1055    struct nvme_queue *q, struct nvme_ccb *ccb)
1056{
1057    mtx_enter(&q->q_sq_mtx);
1058    return (nvme_op_sq_enter_locked(sc, q, ccb));
1059}
1060
1061uint32_t
1062nvme_op_sq_enter_locked(struct nvme_softc *sc,
1063    struct nvme_queue *q, struct nvme_ccb *ccb)
1064{
1065    return (q->q_sq_tail);
1066}
1067
1068void
1069nvme_op_sq_leave_locked(struct nvme_softc *sc,
1070    struct nvme_queue *q, struct nvme_ccb *ccb)
1071{
1072    uint32_t tail;
1073
1074    tail = ++q->q_sq_tail;
1075    if (tail >= q->q_entries)
1076        tail = 0;
1077    q->q_sq_tail = tail;
1078    nvme_write4(sc, q->q_sqtdbl, tail);
1079}
1080
1081void
1082nvme_op_sq_leave(struct nvme_softc *sc,
1083    struct nvme_queue *q, struct nvme_ccb *ccb)
1084{
1085    nvme_op_sq_leave_locked(sc, q, ccb);
1086    mtx_leave(&q->q_sq_mtx);
1087}
1088
/*
 * Build and submit one command on queue q.  The fill callback writes the
 * SQE in place; the slot index comes from the backend's op_sq_enter hook
 * (which also provides mutual exclusion) and op_sq_leave rings the
 * doorbell.  The ccb's id is stamped into the SQE so the completion
 * path can find the ccb again via cqe->cid.
 */
void
nvme_q_submit(struct nvme_softc *sc, struct nvme_queue *q, struct nvme_ccb *ccb,
    void (*fill)(struct nvme_softc *, struct nvme_ccb *, void *))
{
    struct nvme_sqe *sqe = NVME_DMA_KVA(q->q_sq_dmamem);
    u_int32_t tail;

    tail = sc->sc_ops->op_sq_enter(sc, q, ccb);

    sqe += tail;

    /* Reclaim the slot from the device before the CPU writes it ... */
    bus_dmamap_sync(sc->sc_dmat, NVME_DMA_MAP(q->q_sq_dmamem),
        sizeof(*sqe) * tail, sizeof(*sqe), BUS_DMASYNC_POSTWRITE);
    memset(sqe, 0, sizeof(*sqe));
    (*fill)(sc, ccb, sqe);
    sqe->cid = ccb->ccb_id;
    /* ... and flush it back out before the doorbell is rung. */
    bus_dmamap_sync(sc->sc_dmat, NVME_DMA_MAP(q->q_sq_dmamem),
        sizeof(*sqe) * tail, sizeof(*sqe), BUS_DMASYNC_PREWRITE);

    sc->sc_ops->op_sq_leave(sc, q, ccb);
}
1110
/*
 * Scratch state shared between nvme_poll() and its fill/done callbacks:
 * "s" holds the SQE prebuilt by the caller's fill function and "c"
 * receives the completion flags, with the phase bit forced on by
 * nvme_poll_done() as a "command completed" marker.
 */
struct nvme_poll_state {
    struct nvme_sqe s;
    struct nvme_cqe c;
};
1115
/*
 * Submit a command and busy-wait for its completion for up to "ms"
 * milliseconds (ms == 0 waits forever).  The ccb's done/cookie fields
 * are temporarily redirected to a local nvme_poll_state; the saved
 * cookie is restored and the saved done callback invoked afterwards.
 * Note that ccb->ccb_done is left pointing at nvme_poll_done on return,
 * so callers reusing the ccb must reassign it.
 *
 * Returns the CQE status flags with the phase bit masked off (0 on
 * success).
 */
int
nvme_poll(struct nvme_softc *sc, struct nvme_queue *q, struct nvme_ccb *ccb,
    void (*fill)(struct nvme_softc *, struct nvme_ccb *, void *), u_int32_t ms)
{
    struct nvme_poll_state state;
    void (*done)(struct nvme_softc *, struct nvme_ccb *);
    void *cookie;
    int64_t us;

    memset(&state, 0, sizeof(state));
    (*fill)(sc, ccb, &state.s);

    /* Save the caller's callback/cookie and interpose our own. */
    done = ccb->ccb_done;
    cookie = ccb->ccb_cookie;

    ccb->ccb_done = nvme_poll_done;
    ccb->ccb_cookie = &state;

    nvme_q_submit(sc, q, ccb, nvme_poll_fill);
    /*
     * NOTE(review): NVME_TIMO_DELAYNS is both passed to delay()
     * (microseconds) and subtracted from a microsecond budget; confirm
     * the constant's unit matches its "NS" name.  The budget is also
     * decremented even on iterations that made progress and skipped the
     * delay, so the timeout is conservative.
     */
    for (us = ms * 1000; ms == 0 || us > 0; us -= NVME_TIMO_DELAYNS) {
        if (ISSET(state.c.flags, NVME_CQE_PHASE))
            break;
        if (nvme_q_complete(sc, q) == 0)
            delay(NVME_TIMO_DELAYNS);
        nvme_barrier(sc, NVME_CSTS, 4, BUS_SPACE_BARRIER_READ);
    }

    ccb->ccb_cookie = cookie;
    done(sc, ccb);

    return (ccb->ccb_cqe_flags & ~NVME_CQE_PHASE);
}
1148
1149void
1150nvme_poll_fill(struct nvme_softc *sc, struct nvme_ccb *ccb, void *slot)
1151{
1152    struct nvme_sqe *sqe = slot;
1153    struct nvme_poll_state *state = ccb->ccb_cookie;
1154
1155    *sqe = state->s;
1156}
1157
1158void
1159nvme_poll_done(struct nvme_softc *sc, struct nvme_ccb *ccb)
1160{
1161    struct nvme_poll_state *state = ccb->ccb_cookie;
1162
1163    state->c.flags = ccb->ccb_cqe_flags;
1164    SET(state->c.flags, NVME_CQE_PHASE);
1165}
1166
1167void
1168nvme_sqe_fill(struct nvme_softc *sc, struct nvme_ccb *ccb, void *slot)
1169{
1170    struct nvme_sqe *src = ccb->ccb_cookie;
1171    struct nvme_sqe *dst = slot;
1172
1173    *dst = *src;
1174}
1175
/*
 * Completion callback that intentionally does nothing; used by callers
 * of nvme_poll() that inspect the returned status flags themselves.
 */
void
nvme_empty_done(struct nvme_softc *sc, struct nvme_ccb *ccb)
{
}
1180
/*
 * Default op_cq_done hook: nothing to do for plain PCIe controllers;
 * backends with extra per-completion bookkeeping override this.
 */
void
nvme_op_cq_done(struct nvme_softc *sc,
    struct nvme_queue *q, struct nvme_ccb *ccb)
{
    /* nop */
}
1187
/*
 * Reap completions from queue q.  Walks the completion ring while the
 * CQE phase bit matches the expected phase, collects the completed ccbs
 * on a local list, updates the CQ head doorbell, and only then runs the
 * done callbacks outside q_cq_mtx (so callbacks may resubmit).
 *
 * Returns 1 if any completion was processed, 0 if none, or -1 if the
 * mutex was contended (another CPU is already reaping).
 */
int
nvme_q_complete(struct nvme_softc *sc, struct nvme_queue *q)
{
    struct nvme_ccb *ccb, *ccbtmp;
    struct nvme_cqe *ring = NVME_DMA_KVA(q->q_cq_dmamem), *cqe;
    u_int32_t head;
    u_int16_t flags;
    struct nvme_ccb_list done_list;
    int rv = 0;

    if (!mtx_enter_try(&q->q_cq_mtx))
        return (-1);

    SIMPLEQ_INIT(&done_list);
    head = q->q_cq_head;

    nvme_dmamem_sync(sc, q->q_cq_dmamem, BUS_DMASYNC_POSTREAD);
    for (;;) {
        cqe = &ring[head];
        flags = lemtoh16(&cqe->flags);
        /* Phase mismatch means the device hasn't written this slot. */
        if ((flags & NVME_CQE_PHASE) != q->q_cq_phase)
            break;

        /* Order the phase-bit read before reading the CQE body. */
        membar_consumer();

        ccb = &sc->sc_ccbs[cqe->cid];
        sc->sc_ops->op_cq_done(sc, q, ccb);

        ccb->ccb_cqe_flags = lemtoh16(&cqe->flags);
        SIMPLEQ_INSERT_TAIL(&done_list, ccb, ccb_entry);

        /* The phase flips every time the head wraps. */
        if (++head >= q->q_entries) {
            head = 0;
            q->q_cq_phase ^= NVME_CQE_PHASE;
        }

        rv = 1;
    }
    nvme_dmamem_sync(sc, q->q_cq_dmamem, BUS_DMASYNC_PREREAD);

    if (rv)
        nvme_write4(sc, q->q_cqhdbl, q->q_cq_head = head);
    mtx_leave(&q->q_cq_mtx);

    /* Run completion callbacks after the mutex is dropped. */
    SIMPLEQ_FOREACH_SAFE(ccb, &done_list, ccb_entry, ccbtmp) {
        ccb->ccb_done(sc, ccb);
    }

    return (rv);
}
1238
1239int
1240nvme_identify(struct nvme_softc *sc, u_int mpsmin)
1241{
1242    char sn[41], mn[81], fr[17];
1243    struct nvm_identify_controller *identify;
1244    struct nvme_dmamem *mem;
1245    struct nvme_ccb *ccb;
1246    int rv = 1;
1247
1248    ccb = nvme_ccb_get(sc);
1249    if (ccb == NULL)
1250        panic("nvme_identify: nvme_ccb_get returned NULL");
1251
1252    mem = nvme_dmamem_alloc(sc, sizeof(*identify));
1253    if (mem == NULL)
1254        return (1);
1255
1256    ccb->ccb_done = nvme_empty_done;
1257    ccb->ccb_cookie = mem;
1258
1259    nvme_dmamem_sync(sc, mem, BUS_DMASYNC_PREREAD);
1260    rv = nvme_poll(sc, sc->sc_admin_q, ccb, nvme_fill_identify,
1261        NVME_TIMO_IDENT);
1262    nvme_dmamem_sync(sc, mem, BUS_DMASYNC_POSTREAD);
1263
1264    nvme_ccb_put(sc, ccb);
1265
1266    if (rv != 0)
1267        goto done;
1268
1269    identify = NVME_DMA_KVA(mem);
1270
1271    scsi_strvis(sn, identify->sn, sizeof(identify->sn));
1272    scsi_strvis(mn, identify->mn, sizeof(identify->mn));
1273    scsi_strvis(fr, identify->fr, sizeof(identify->fr));
1274
1275    printf("%s: %s, firmware %s, serial %s\n", DEVNAME(sc), mn, fr, sn);
1276
1277    if (identify->mdts > 0) {
1278        sc->sc_mdts = (1 << identify->mdts) * (1 << mpsmin);
1279        if (sc->sc_mdts > NVME_MAXPHYS)
1280            sc->sc_mdts = NVME_MAXPHYS;
1281        sc->sc_max_prpl = sc->sc_mdts / sc->sc_mps;
1282    }
1283
1284    sc->sc_nn = lemtoh32(&identify->nn);
1285
1286    /*
1287     * At least one Apple NVMe device presents a second, bogus disk that is
1288     * inaccessible, so cap targets at 1.
1289     *
1290     * sd1 at scsibus1 targ 2 lun 0: <NVMe, APPLE SSD AP0512, 16.1> [..]
1291     * sd1: 0MB, 4096 bytes/sector, 2 sectors
1292     */
1293    if (sc->sc_nn > 1 &&
1294        mn[0] == 'A' && mn[1] == 'P' && mn[2] == 'P' && mn[3] == 'L' &&
1295        mn[4] == 'E')
1296        sc->sc_nn = 1;
1297
1298    memcpy(&sc->sc_identify, identify, sizeof(sc->sc_identify));
1299
1300done:
1301    nvme_dmamem_free(sc, mem);
1302
1303    return (rv);
1304}
1305
1306int
1307nvme_q_create(struct nvme_softc *sc, struct nvme_queue *q)
1308{
1309    struct nvme_sqe_q sqe;
1310    struct nvme_ccb *ccb;
1311    int rv;
1312
1313    ccb = scsi_io_get(&sc->sc_iopool, 0);
1314    KASSERT(ccb != NULL);
1315
1316    ccb->ccb_done = nvme_empty_done;
1317    ccb->ccb_cookie = &sqe;
1318
1319    memset(&sqe, 0, sizeof(sqe));
1320    sqe.opcode = NVM_ADMIN_ADD_IOCQ;
1321    htolem64(&sqe.prp1, NVME_DMA_DVA(q->q_cq_dmamem));
1322    htolem16(&sqe.qsize, q->q_entries - 1);
1323    htolem16(&sqe.qid, q->q_id);
1324    sqe.qflags = NVM_SQE_CQ_IEN | NVM_SQE_Q_PC;
1325
1326    rv = nvme_poll(sc, sc->sc_admin_q, ccb, nvme_sqe_fill, NVME_TIMO_QOP);
1327    if (rv != 0)
1328        goto fail;
1329
1330    ccb->ccb_done = nvme_empty_done;
1331    ccb->ccb_cookie = &sqe;
1332
1333    memset(&sqe, 0, sizeof(sqe));
1334    sqe.opcode = NVM_ADMIN_ADD_IOSQ;
1335    htolem64(&sqe.prp1, NVME_DMA_DVA(q->q_sq_dmamem));
1336    htolem16(&sqe.qsize, q->q_entries - 1);
1337    htolem16(&sqe.qid, q->q_id);
1338    htolem16(&sqe.cqid, q->q_id);
1339    sqe.qflags = NVM_SQE_Q_PC;
1340
1341    rv = nvme_poll(sc, sc->sc_admin_q, ccb, nvme_sqe_fill, NVME_TIMO_QOP);
1342    if (rv != 0)
1343        goto fail;
1344
1345fail:
1346    scsi_io_put(&sc->sc_iopool, ccb);
1347    return (rv);
1348}
1349
1350int
1351nvme_q_delete(struct nvme_softc *sc, struct nvme_queue *q)
1352{
1353    struct nvme_sqe_q sqe;
1354    struct nvme_ccb *ccb;
1355    int rv;
1356
1357    ccb = scsi_io_get(&sc->sc_iopool, 0);
1358    KASSERT(ccb != NULL);
1359
1360    ccb->ccb_done = nvme_empty_done;
1361    ccb->ccb_cookie = &sqe;
1362
1363    memset(&sqe, 0, sizeof(sqe));
1364    sqe.opcode = NVM_ADMIN_DEL_IOSQ;
1365    htolem16(&sqe.qid, q->q_id);
1366
1367    rv = nvme_poll(sc, sc->sc_admin_q, ccb, nvme_sqe_fill, NVME_TIMO_QOP);
1368    if (rv != 0)
1369        goto fail;
1370
1371    ccb->ccb_done = nvme_empty_done;
1372    ccb->ccb_cookie = &sqe;
1373
1374    memset(&sqe, 0, sizeof(sqe));
1375    sqe.opcode = NVM_ADMIN_DEL_IOCQ;
1376    htolem16(&sqe.qid, q->q_id);
1377
1378    rv = nvme_poll(sc, sc->sc_admin_q, ccb, nvme_sqe_fill, NVME_TIMO_QOP);
1379    if (rv != 0)
1380        goto fail;
1381
1382    nvme_q_free(sc, q);
1383
1384fail:
1385    scsi_io_put(&sc->sc_iopool, ccb);
1386    return (rv);
1387
1388}
1389
1390void
1391nvme_fill_identify(struct nvme_softc *sc, struct nvme_ccb *ccb, void *slot)
1392{
1393    struct nvme_sqe *sqe = slot;
1394    struct nvme_dmamem *mem = ccb->ccb_cookie;
1395
1396    sqe->opcode = NVM_ADMIN_IDENTIFY;
1397    htolem64(&sqe->entry.prp[0], NVME_DMA_DVA(mem));
1398    htolem32(&sqe->cdw10, 1);
1399}
1400
1401int
1402nvme_ccbs_alloc(struct nvme_softc *sc, u_int nccbs)
1403{
1404    struct nvme_ccb *ccb;
1405    bus_addr_t off;
1406    u_int64_t *prpl;
1407    u_int i;
1408
1409    sc->sc_ccbs = mallocarray(nccbs, sizeof(*ccb), M_DEVBUF,
1410        M_WAITOK | M_CANFAIL);
1411    if (sc->sc_ccbs == NULL)
1412        return (1);
1413
1414    sc->sc_ccb_prpls = nvme_dmamem_alloc(sc,
1415        sizeof(*prpl) * sc->sc_max_prpl * nccbs);
1416
1417    prpl = NVME_DMA_KVA(sc->sc_ccb_prpls);
1418    off = 0;
1419
1420    for (i = 0; i < nccbs; i++) {
1421        ccb = &sc->sc_ccbs[i];
1422
1423        if (bus_dmamap_create(sc->sc_dmat, sc->sc_mdts,
1424            sc->sc_max_prpl + 1, /* we get a free prp in the sqe */
1425            sc->sc_mps, sc->sc_mps,
1426            BUS_DMA_WAITOK | BUS_DMA_ALLOCNOW | BUS_DMA_64BIT,
1427            &ccb->ccb_dmamap) != 0)
1428            goto free_maps;
1429
1430        ccb->ccb_id = i;
1431        ccb->ccb_prpl = prpl;
1432        ccb->ccb_prpl_off = off;
1433        ccb->ccb_prpl_dva = NVME_DMA_DVA(sc->sc_ccb_prpls) + off;
1434
1435        SIMPLEQ_INSERT_TAIL(&sc->sc_ccb_list, ccb, ccb_entry);
1436
1437        prpl += sc->sc_max_prpl;
1438        off += sizeof(*prpl) * sc->sc_max_prpl;
1439    }
1440
1441    return (0);
1442
1443free_maps:
1444    nvme_ccbs_free(sc, nccbs);
1445    return (1);
1446}
1447
1448void *
1449nvme_ccb_get(void *cookie)
1450{
1451    struct nvme_softc *sc = cookie;
1452    struct nvme_ccb *ccb;
1453
1454    mtx_enter(&sc->sc_ccb_mtx);
1455    ccb = SIMPLEQ_FIRST(&sc->sc_ccb_list);
1456    if (ccb != NULL)
1457        SIMPLEQ_REMOVE_HEAD(&sc->sc_ccb_list, ccb_entry);
1458    mtx_leave(&sc->sc_ccb_mtx);
1459
1460    return (ccb);
1461}
1462
1463void
1464nvme_ccb_put(void *cookie, void *io)
1465{
1466    struct nvme_softc *sc = cookie;
1467    struct nvme_ccb *ccb = io;
1468
1469    mtx_enter(&sc->sc_ccb_mtx);
1470    SIMPLEQ_INSERT_HEAD(&sc->sc_ccb_list, ccb, ccb_entry);
1471    mtx_leave(&sc->sc_ccb_mtx);
1472}
1473
1474void
1475nvme_ccbs_free(struct nvme_softc *sc, unsigned int nccbs)
1476{
1477    struct nvme_ccb *ccb;
1478
1479    while ((ccb = SIMPLEQ_FIRST(&sc->sc_ccb_list)) != NULL) {
1480        SIMPLEQ_REMOVE_HEAD(&sc->sc_ccb_list, ccb_entry);
1481        bus_dmamap_destroy(sc->sc_dmat, ccb->ccb_dmamap);
1482    }
1483
1484    nvme_dmamem_free(sc, sc->sc_ccb_prpls);
1485    free(sc->sc_ccbs, M_DEVBUF, nccbs * sizeof(*ccb));
1486}
1487
/*
 * Allocate and initialize the host-side state for one queue pair:
 * the queue structure, the SQ and CQ DMA rings, mutexes, doorbell
 * offsets, and initial head/tail/phase.  Gives the backend a chance to
 * allocate per-queue state via op_q_alloc.  Returns NULL on failure
 * with everything unwound via the goto chain.
 */
struct nvme_queue *
nvme_q_alloc(struct nvme_softc *sc, u_int16_t id, u_int entries, u_int dstrd)
{
    struct nvme_queue *q;

    q = malloc(sizeof(*q), M_DEVBUF, M_WAITOK | M_CANFAIL);
    if (q == NULL)
        return (NULL);

    q->q_sq_dmamem = nvme_dmamem_alloc(sc,
        sizeof(struct nvme_sqe) * entries);
    if (q->q_sq_dmamem == NULL)
        goto free;

    q->q_cq_dmamem = nvme_dmamem_alloc(sc,
        sizeof(struct nvme_cqe) * entries);
    if (q->q_cq_dmamem == NULL)
        goto free_sq;

    memset(NVME_DMA_KVA(q->q_sq_dmamem), 0, NVME_DMA_LEN(q->q_sq_dmamem));
    memset(NVME_DMA_KVA(q->q_cq_dmamem), 0, NVME_DMA_LEN(q->q_cq_dmamem));

    mtx_init(&q->q_sq_mtx, IPL_BIO);
    mtx_init(&q->q_cq_mtx, IPL_BIO);
    /* Doorbell register offsets depend on the controller's stride. */
    q->q_sqtdbl = NVME_SQTDBL(id, dstrd);
    q->q_cqhdbl = NVME_CQHDBL(id, dstrd);

    q->q_id = id;
    q->q_entries = entries;
    q->q_sq_tail = 0;
    q->q_cq_head = 0;
    /* A fresh (zeroed) CQ ring means the first pass expects phase 1. */
    q->q_cq_phase = NVME_CQE_PHASE;

    if (sc->sc_ops->op_q_alloc != NULL) {
        if (sc->sc_ops->op_q_alloc(sc, q) != 0)
            goto free_cq;
    }

    nvme_dmamem_sync(sc, q->q_sq_dmamem, BUS_DMASYNC_PREWRITE);
    nvme_dmamem_sync(sc, q->q_cq_dmamem, BUS_DMASYNC_PREREAD);

    return (q);

free_cq:
    nvme_dmamem_free(sc, q->q_cq_dmamem);
free_sq:
    nvme_dmamem_free(sc, q->q_sq_dmamem);
free:
    free(q, M_DEVBUF, sizeof *q);

    return (NULL);
}
1540
1541int
1542nvme_q_reset(struct nvme_softc *sc, struct nvme_queue *q)
1543{
1544    memset(NVME_DMA_KVA(q->q_sq_dmamem), 0, NVME_DMA_LEN(q->q_sq_dmamem));
1545    memset(NVME_DMA_KVA(q->q_cq_dmamem), 0, NVME_DMA_LEN(q->q_cq_dmamem));
1546
1547    q->q_sq_tail = 0;
1548    q->q_cq_head = 0;
1549    q->q_cq_phase = NVME_CQE_PHASE;
1550
1551    nvme_dmamem_sync(sc, q->q_sq_dmamem, BUS_DMASYNC_PREWRITE);
1552    nvme_dmamem_sync(sc, q->q_cq_dmamem, BUS_DMASYNC_PREREAD);
1553
1554    return (0);
1555}
1556
1557void
1558nvme_q_free(struct nvme_softc *sc, struct nvme_queue *q)
1559{
1560    nvme_dmamem_sync(sc, q->q_cq_dmamem, BUS_DMASYNC_POSTREAD);
1561    nvme_dmamem_sync(sc, q->q_sq_dmamem, BUS_DMASYNC_POSTWRITE);
1562
1563    if (sc->sc_ops->op_q_free != NULL)
1564        sc->sc_ops->op_q_free(sc, q);
1565
1566    nvme_dmamem_free(sc, q->q_cq_dmamem);
1567    nvme_dmamem_free(sc, q->q_sq_dmamem);
1568    free(q, M_DEVBUF, sizeof *q);
1569}
1570
1571int
1572nvme_intr(void *xsc)
1573{
1574    struct nvme_softc *sc = xsc;
1575    int rv = 0;
1576
1577    if (nvme_q_complete(sc, sc->sc_q))
1578        rv = 1;
1579    if (nvme_q_complete(sc, sc->sc_admin_q))
1580        rv = 1;
1581
1582    return (rv);
1583}
1584
/*
 * INTx interrupt handler: mask the controller interrupt while the
 * queues are drained, then unmask it again.
 */
int
nvme_intr_intx(void *xsc)
{
    struct nvme_softc *sc = xsc;
    int rv;

    nvme_write4(sc, NVME_INTMS, 1);
    rv = nvme_intr(sc);
    nvme_write4(sc, NVME_INTMC, 1);

    return (rv);
}
1597
/*
 * Allocate a physically contiguous, controller-page-aligned DMA region
 * of "size" bytes and wrap it in an nvme_dmamem (map + segment + kva).
 * Any partial failure is unwound through the goto chain.  Returns NULL
 * on failure.
 */
struct nvme_dmamem *
nvme_dmamem_alloc(struct nvme_softc *sc, size_t size)
{
    struct nvme_dmamem *ndm;
    int nsegs;

    /* M_WAITOK malloc does not fail; the NULL check is belt-and-braces. */
    ndm = malloc(sizeof(*ndm), M_DEVBUF, M_WAITOK | M_ZERO);
    if (ndm == NULL)
        return (NULL);

    ndm->ndm_size = size;

    if (bus_dmamap_create(sc->sc_dmat, size, 1, size, 0,
        BUS_DMA_WAITOK | BUS_DMA_ALLOCNOW | BUS_DMA_64BIT,
        &ndm->ndm_map) != 0)
        goto ndmfree;

    /* Single segment, aligned to the controller's page size (sc_mps). */
    if (bus_dmamem_alloc(sc->sc_dmat, size, sc->sc_mps, 0, &ndm->ndm_seg,
        1, &nsegs, BUS_DMA_WAITOK | BUS_DMA_ZERO | BUS_DMA_64BIT) != 0)
        goto destroy;

    if (bus_dmamem_map(sc->sc_dmat, &ndm->ndm_seg, nsegs, size,
        &ndm->ndm_kva, BUS_DMA_WAITOK) != 0)
        goto free;

    if (bus_dmamap_load(sc->sc_dmat, ndm->ndm_map, ndm->ndm_kva, size,
        NULL, BUS_DMA_WAITOK) != 0)
        goto unmap;

    return (ndm);

unmap:
    bus_dmamem_unmap(sc->sc_dmat, ndm->ndm_kva, size);
free:
    bus_dmamem_free(sc->sc_dmat, &ndm->ndm_seg, 1);
destroy:
    bus_dmamap_destroy(sc->sc_dmat, ndm->ndm_map);
ndmfree:
    free(ndm, M_DEVBUF, sizeof *ndm);

    return (NULL);
}
1640
1641void
1642nvme_dmamem_sync(struct nvme_softc *sc, struct nvme_dmamem *mem, int ops)
1643{
1644    bus_dmamap_sync(sc->sc_dmat, NVME_DMA_MAP(mem),
1645        0, NVME_DMA_LEN(mem), ops);
1646}
1647
/*
 * Release a region created by nvme_dmamem_alloc(), undoing each setup
 * step in reverse order: unload, unmap, free the segment, destroy the
 * map, then free the wrapper.
 */
void
nvme_dmamem_free(struct nvme_softc *sc, struct nvme_dmamem *ndm)
{
    bus_dmamap_unload(sc->sc_dmat, ndm->ndm_map);
    bus_dmamem_unmap(sc->sc_dmat, ndm->ndm_kva, ndm->ndm_size);
    bus_dmamem_free(sc->sc_dmat, &ndm->ndm_seg, 1);
    bus_dmamap_destroy(sc->sc_dmat, ndm->ndm_map);
    free(ndm, M_DEVBUF, sizeof *ndm);
}
1657
1658#ifdef HIBERNATE
1659
/*
 * Issue a single admin command while hibernating and spin for its
 * completion.  Runs single-threaded with no ccbs, so the locked SQ
 * hooks are called with a NULL ccb and the completion ring is polled
 * directly.  "cid" is a caller-chosen command id used to match the
 * completion.  Returns 0 on success, EIO on error status or cid
 * mismatch.
 */
int
nvme_hibernate_admin_cmd(struct nvme_softc *sc, struct nvme_sqe *sqe,
    struct nvme_cqe *cqe, int cid)
{
    struct nvme_sqe *asqe = NVME_DMA_KVA(sc->sc_admin_q->q_sq_dmamem);
    struct nvme_cqe *acqe = NVME_DMA_KVA(sc->sc_admin_q->q_cq_dmamem);
    struct nvme_queue *q = sc->sc_admin_q;
    int tail;
    u_int16_t flags;

    /* submit command */
    tail = sc->sc_ops->op_sq_enter_locked(sc, q, /* XXX ccb */ NULL);

    asqe += tail;
    bus_dmamap_sync(sc->sc_dmat, NVME_DMA_MAP(q->q_sq_dmamem),
        sizeof(*sqe) * tail, sizeof(*sqe), BUS_DMASYNC_POSTWRITE);
    *asqe = *sqe;
    asqe->cid = cid;
    bus_dmamap_sync(sc->sc_dmat, NVME_DMA_MAP(q->q_sq_dmamem),
        sizeof(*sqe) * tail, sizeof(*sqe), BUS_DMASYNC_PREWRITE);

    sc->sc_ops->op_sq_leave_locked(sc, q, /* XXX ccb */ NULL);

    /* wait for completion */
    acqe += q->q_cq_head;
    for (;;) {
        nvme_dmamem_sync(sc, q->q_cq_dmamem, BUS_DMASYNC_POSTREAD);
        flags = lemtoh16(&acqe->flags);
        /* Phase match means the device has written this slot. */
        if ((flags & NVME_CQE_PHASE) == q->q_cq_phase)
            break;

        delay(10);
    }

    /* Consume the entry: advance the head, flip phase on wrap. */
    if (++q->q_cq_head >= q->q_entries) {
        q->q_cq_head = 0;
        q->q_cq_phase ^= NVME_CQE_PHASE;
    }
    nvme_write4(sc, q->q_cqhdbl, q->q_cq_head);
    if ((NVME_CQE_SC(flags) != NVME_CQE_SC_SUCCESS) || (acqe->cid != cid))
        return (EIO);

    return (0);
}
1704
/*
 * Hibernate I/O entry point.  HIB_INIT locates the nvme softc and
 * namespace behind the suspend device and stands up the dedicated
 * hibernate queue pair; HIB_W writes one chunk synchronously through
 * that queue, polling the completion ring.  All per-call state lives in
 * the single preallocated "page" (struct nvme_hibernate_page) because
 * no memory may be allocated while hibernating.
 */
int
nvme_hibernate_io(dev_t dev, daddr_t blkno, vaddr_t addr, size_t size,
    int op, void *page)
{
    struct nvme_hibernate_page {
        u_int64_t       prpl[MAXPHYS / PAGE_SIZE];

        struct nvme_softc   *sc;
        int         nsid;
        int         sq_tail;
        int         cq_head;
        int         cqe_phase;

        daddr_t         poffset;
        size_t          psize;
        u_int32_t       secsize;
    } *my = page;
    struct nvme_sqe_io *isqe;
    struct nvme_cqe *icqe;
    paddr_t data_phys, page_phys;
    u_int64_t data_bus_phys, page_bus_phys;
    u_int16_t flags;
    int i;
    int error;

    if (op == HIB_INIT) {
        struct device *disk;
        struct device *scsibus;
        struct nvm_identify_namespace *ns;
        struct nvm_namespace_format *f;
        extern struct cfdriver sd_cd;
        struct scsi_link *link;
        struct scsibus_softc *bus_sc;
        struct nvme_sqe_q qsqe;
        struct nvme_cqe qcqe;

        /* find nvme softc */
        disk = disk_lookup(&sd_cd, DISKUNIT(dev));
        scsibus = disk->dv_parent;
        my->sc = (struct nvme_softc *)disk->dv_parent->dv_parent;

        /* find scsi_link, which tells us the target */
        my->nsid = 0;
        bus_sc = (struct scsibus_softc *)scsibus;
        SLIST_FOREACH(link, &bus_sc->sc_link_list, bus_list) {
            if (link->device_softc == disk) {
                my->nsid = link->target;
                break;
            }
        }
        if (my->nsid == 0)
            return (EIO);
        ns = my->sc->sc_namespaces[my->nsid].ident;
        f = &ns->lbaf[NVME_ID_NS_FLBAS(ns->flbas)];

        my->poffset = blkno;
        my->psize = size;
        /* lbads is log2 of the LBA data size. */
        my->secsize = 1 << f->lbads;

        memset(NVME_DMA_KVA(my->sc->sc_hib_q->q_cq_dmamem), 0,
            my->sc->sc_hib_q->q_entries * sizeof(struct nvme_cqe));
        memset(NVME_DMA_KVA(my->sc->sc_hib_q->q_sq_dmamem), 0,
            my->sc->sc_hib_q->q_entries * sizeof(struct nvme_sqe));

        my->sq_tail = 0;
        my->cq_head = 0;
        my->cqe_phase = NVME_CQE_PHASE;

        /* Create the hibernate CQ, then the SQ pointing at it. */
        memset(&qsqe, 0, sizeof(qsqe));
        qsqe.opcode = NVM_ADMIN_ADD_IOCQ;
        htolem64(&qsqe.prp1,
            NVME_DMA_DVA(my->sc->sc_hib_q->q_cq_dmamem));
        htolem16(&qsqe.qsize, my->sc->sc_hib_q->q_entries - 1);
        htolem16(&qsqe.qid, my->sc->sc_hib_q->q_id);
        qsqe.qflags = NVM_SQE_CQ_IEN | NVM_SQE_Q_PC;
        if (nvme_hibernate_admin_cmd(my->sc, (struct nvme_sqe *)&qsqe,
            &qcqe, 1) != 0)
            return (EIO);

        memset(&qsqe, 0, sizeof(qsqe));
        qsqe.opcode = NVM_ADMIN_ADD_IOSQ;
        htolem64(&qsqe.prp1,
            NVME_DMA_DVA(my->sc->sc_hib_q->q_sq_dmamem));
        htolem16(&qsqe.qsize, my->sc->sc_hib_q->q_entries - 1);
        htolem16(&qsqe.qid, my->sc->sc_hib_q->q_id);
        htolem16(&qsqe.cqid, my->sc->sc_hib_q->q_id);
        qsqe.qflags = NVM_SQE_Q_PC;
        if (nvme_hibernate_admin_cmd(my->sc, (struct nvme_sqe *)&qsqe,
            &qcqe, 2) != 0)
            return (EIO);

        return (0);
    }

    if (op != HIB_W)
        return (0);

    if (blkno + (size / DEV_BSIZE) > my->psize)
        return E2BIG;

    /* Claim the next SQ slot. */
    isqe = NVME_DMA_KVA(my->sc->sc_hib_q->q_sq_dmamem);
    isqe += my->sq_tail;
    if (++my->sq_tail == my->sc->sc_hib_q->q_entries)
        my->sq_tail = 0;

    memset(isqe, 0, sizeof(*isqe));
    isqe->opcode = NVM_CMD_WRITE;
    htolem32(&isqe->nsid, my->nsid);

    /*
     * PRP setup: prp[0] is the data's physical address; one extra page
     * fits in prp[1], anything larger needs the PRP list embedded in
     * this hibernate page.
     */
    pmap_extract(pmap_kernel(), addr, &data_phys);
    data_bus_phys = data_phys;
    htolem64(&isqe->entry.prp[0], data_bus_phys);
    if ((size > my->sc->sc_mps) && (size <= my->sc->sc_mps * 2)) {
        htolem64(&isqe->entry.prp[1], data_bus_phys + my->sc->sc_mps);
    } else if (size > my->sc->sc_mps * 2) {
        pmap_extract(pmap_kernel(), (vaddr_t)page, &page_phys);
        page_bus_phys = page_phys;
        htolem64(&isqe->entry.prp[1], page_bus_phys +
            offsetof(struct nvme_hibernate_page, prpl));
        for (i = 1; i < howmany(size, my->sc->sc_mps); i++) {
            htolem64(&my->prpl[i - 1], data_bus_phys +
                (i * my->sc->sc_mps));
        }
    }

    /*
     * NOTE(review): slba/nlb/cid are stored without explicit htolem
     * conversion, unlike the fields above - confirm this is intended
     * (little-endian hosts only?).
     */
    isqe->slba = (blkno + my->poffset) / (my->secsize / DEV_BSIZE);
    isqe->nlb = (size / my->secsize) - 1;
    isqe->cid = blkno % 0xffff;

    nvme_write4(my->sc, NVME_SQTDBL(NVME_HIB_Q, my->sc->sc_dstrd),
        my->sq_tail);
    nvme_barrier(my->sc, NVME_SQTDBL(NVME_HIB_Q, my->sc->sc_dstrd), 4,
        BUS_SPACE_BARRIER_WRITE);

    error = 0;

    icqe = NVME_DMA_KVA(my->sc->sc_hib_q->q_cq_dmamem);
    icqe += my->cq_head;

    /* Spin on the CQE phase bit until the write completes. */
    nvme_dmamem_sync(my->sc, my->sc->sc_hib_q->q_cq_dmamem,
        BUS_DMASYNC_POSTREAD);
    for (;;) {
        flags = lemtoh16(&icqe->flags);
        if ((flags & NVME_CQE_PHASE) == my->cqe_phase) {
            if ((NVME_CQE_SC(flags) != NVME_CQE_SC_SUCCESS) ||
                (icqe->cid != blkno % 0xffff))
                error = EIO;

            break;
        }

        delay(1);
        nvme_dmamem_sync(my->sc, my->sc->sc_hib_q->q_cq_dmamem,
            BUS_DMASYNC_PREREAD|BUS_DMASYNC_POSTREAD);
    }
    nvme_dmamem_sync(my->sc, my->sc->sc_hib_q->q_cq_dmamem,
        BUS_DMASYNC_PREREAD);

    /* Consume the completion: advance head, flip phase on wrap. */
    if (++my->cq_head == my->sc->sc_hib_q->q_entries) {
        my->cq_head = 0;
        my->cqe_phase ^= NVME_CQE_PHASE;
    }

    nvme_write4(my->sc, NVME_CQHDBL(NVME_HIB_Q, my->sc->sc_dstrd),
        my->cq_head);
    nvme_barrier(my->sc, NVME_CQHDBL(NVME_HIB_Q, my->sc->sc_dstrd), 4,
        BUS_SPACE_BARRIER_WRITE);

    return (error);
}
1875
1876#endif
1877
1878#if NBIO > 0
1879int
1880nvme_bioctl(struct device *self, u_long cmd, caddr_t data)
1881{
1882    struct nvme_softc   *sc = (struct nvme_softc *)self;
1883    struct nvme_pt_cmd  *pt;
1884    int          error = 0;
1885
1886    rw_enter_write(&sc->sc_lock);
1887
1888    switch (cmd) {
1889    case BIOCINQ:
1890        error = nvme_bioctl_inq(sc, (struct bioc_inq *)data);
1891        break;
1892    case BIOCVOL:
1893        error = nvme_bioctl_vol(sc, (struct bioc_vol *)data);
1894        break;
1895    case BIOCDISK:
1896        error = nvme_bioctl_disk(sc, (struct bioc_disk *)data);
1897        break;
1898    case NVME_PASSTHROUGH_CMD:
1899        pt = (struct nvme_pt_cmd *)data;
1900        error = nvme_passthrough_cmd(sc, pt, sc->sc_dev.dv_unit, -1);
1901        break;
1902    default:
1903        printf("nvme_bioctl() Unknown command (%lu)\n", cmd);
1904        error = ENOTTY;
1905    }
1906
1907    rw_exit_write(&sc->sc_lock);
1908
1909    return error;
1910}
1911
/*
 * Append a formatted informational message to a bio_status.  Note the
 * underlying bio_status() takes a pointer to the va_list.
 */
void
nvme_bio_status(struct bio_status *bs, const char *fmt, ...)
{
    va_list         ap;

    va_start(ap, fmt);
    bio_status(bs, 0, BIO_MSG_INFO, fmt, &ap);
    va_end(ap);
}
1921
1922const char *
1923nvme_bioctl_sdname(const struct nvme_softc *sc, int target)
1924{
1925    const struct scsi_link      *link;
1926    const struct sd_softc       *sd;
1927
1928    link = scsi_get_link(sc->sc_scsibus, target, 0);
1929    if (link == NULL)
1930        return NULL;
1931    sd = (struct sd_softc *)(link->device_softc);
1932    if (ISSET(link->state, SDEV_S_DYING) || sd == NULL ||
1933        ISSET(sd->flags, SDF_DYING))
1934        return NULL;
1935
1936    if (nvme_read4(sc, NVME_VS) == 0xffffffff)
1937        return NULL;
1938
1939    return DEVNAME(sd);
1940}
1941
/*
 * BIOCINQ handler: report controller-level inventory to bioctl(8) -
 * namespace count, identify strings, capabilities, and a decode of the
 * live CC/CSTS/VS registers.  Always returns 0; errors are reported
 * through the bio_status messages.
 */
int
nvme_bioctl_inq(struct nvme_softc *sc, struct bioc_inq *bi)
{
    char                 sn[41], mn[81], fr[17];
    struct nvm_identify_controller  *idctrl = &sc->sc_identify;
    struct bio_status       *bs;
    unsigned int             nn;
    uint32_t             cc, csts, vs;

    /* Don't tell bioctl about namespaces > last configured namespace. */
    for (nn = sc->sc_nn; nn > 0; nn--) {
        if (sc->sc_namespaces[nn].ident)
            break;
    }
    bi->bi_novol = bi->bi_nodisk = nn;
    strlcpy(bi->bi_dev, DEVNAME(sc), sizeof(bi->bi_dev));

    bs = &bi->bi_bio.bio_status;
    bio_status_init(bs, &sc->sc_dev);
    bs->bs_status = BIO_STATUS_SUCCESS;

    scsi_strvis(sn, idctrl->sn, sizeof(idctrl->sn));
    scsi_strvis(mn, idctrl->mn, sizeof(idctrl->mn));
    scsi_strvis(fr, idctrl->fr, sizeof(idctrl->fr));

    nvme_bio_status(bs, "%s, %s, %s", mn, fr, sn);
    nvme_bio_status(bs, "Max i/o %zu bytes%s%s%s, Sanitize 0x%b",
        sc->sc_mdts,
        ISSET(idctrl->lpa, NVM_ID_CTRL_LPA_PE) ?
        ", Persistent Event Log" : "",
        ISSET(idctrl->fna, NVM_ID_CTRL_FNA_CRYPTOFORMAT) ?
        ", CryptoFormat" : "",
        ISSET(idctrl->vwc, NVM_ID_CTRL_VWC_PRESENT) ?
        ", Volatile Write Cache" : "",
        lemtoh32(&idctrl->sanicap), NVM_ID_CTRL_SANICAP_FMT
    );

    if (idctrl->ctratt != 0)
        nvme_bio_status(bs, "Features 0x%b", lemtoh32(&idctrl->ctratt),
            NVM_ID_CTRL_CTRATT_FMT);

    if (idctrl->oacs || idctrl->oncs) {
        nvme_bio_status(bs, "Admin commands 0x%b, NVM commands 0x%b",
            lemtoh16(&idctrl->oacs), NVM_ID_CTRL_OACS_FMT,
            lemtoh16(&idctrl->oncs), NVM_ID_CTRL_ONCS_FMT);
    }

    cc = nvme_read4(sc, NVME_CC);
    csts = nvme_read4(sc, NVME_CSTS);
    vs = nvme_read4(sc, NVME_VS);

    /* All-ones VS means the controller has dropped off the bus. */
    if (vs == 0xffffffff) {
        nvme_bio_status(bs, "Invalid PCIe register mapping");
        return 0;
    }

    nvme_bio_status(bs, "NVMe %u.%u%s%s%sabled, %sReady%s%s%s%s",
        NVME_VS_MJR(vs), NVME_VS_MNR(vs),
        (NVME_CC_CSS_R(cc) == NVME_CC_CSS_NVM) ? ", NVM I/O command set" : "",
        (NVME_CC_CSS_R(cc) == 0x7) ? ", Admin command set only" : "",
        ISSET(cc, NVME_CC_EN) ? ", En" : "Dis",
        ISSET(csts, NVME_CSTS_RDY) ? "" : "Not ",
        ISSET(csts, NVME_CSTS_CFS) ? ", Fatal Error, " : "",
        (NVME_CC_SHN_R(cc) == NVME_CC_SHN_NORMAL) ? ", Normal shutdown" : "",
        (NVME_CC_SHN_R(cc) == NVME_CC_SHN_ABRUPT) ? ", Abrupt shutdown" : "",
        ISSET(csts, NVME_CSTS_SHST_DONE) ? " complete" : "");

    return 0;
}
2011
/*
 * BIOCVOL handler: report one namespace as a bioctl "volume" - its
 * size, its attached sd(4) device name (if any), and online/offline
 * status.  bv_volid is zero-based; namespaces are one-based.
 */
int
nvme_bioctl_vol(struct nvme_softc *sc, struct bioc_vol *bv)
{
    const struct nvm_identify_namespace *idns;
    const char              *sd;
    int                  target;
    unsigned int                 lbaf;

    target = bv->bv_volid + 1;
    if (target > sc->sc_nn) {
        bv->bv_status = BIOC_SVINVALID;
        return 0;
    }

    bv->bv_level = 'c';
    bv->bv_nodisk = 1;

    idns = sc->sc_namespaces[target].ident;
    if (idns == NULL) {
        bv->bv_status = BIOC_SVINVALID;
        return 0;
    }

    /*
     * Extended LBA format index when more than 16 formats exist.
     * NOTE(review): this uses "nlbaf > 16" while nvme_bioctl_disk()
     * compares against nitems(idns->lbaf) - confirm they agree.
     */
    lbaf = NVME_ID_NS_FLBAS(idns->flbas);
    if (idns->nlbaf > 16)
        lbaf |= (idns->flbas >> 1) & 0x3f;
    bv->bv_size = nvme_scsi_size(idns) << idns->lbaf[lbaf].lbads;

    sd = nvme_bioctl_sdname(sc, target);
    if (sd) {
        strlcpy(bv->bv_dev, sd, sizeof(bv->bv_dev));
        bv->bv_status = BIOC_SVONLINE;
    } else
        bv->bv_status = BIOC_SVOFFLINE;

    return 0;
}
2049
2050int
2051nvme_bioctl_disk(struct nvme_softc *sc, struct bioc_disk *bd)
2052{
2053    const char          *rpdesc[4] = {
2054        " (Best)",
2055        " (Better)",
2056        " (Good)",
2057        " (Degraded)"
2058    };
2059    const char          *protection[4] = {
2060        "not enabled",
2061        "Type 1",
2062        "Type 2",
2063        "Type 3",
2064    };
2065    char                 buf[32], msg[BIO_MSG_LEN];
2066    struct nvm_identify_namespace   *idns;
2067    struct bio_status       *bs;
2068    uint64_t             id1, id2;
2069    unsigned int             i, lbaf, target;
2070    uint16_t             ms;
2071    uint8_t              dps;
2072
2073    target = bd->bd_volid + 1;
2074    if (target > sc->sc_nn)
2075        return EINVAL;
2076    bd->bd_channel = sc->sc_scsibus->sc_dev.dv_unit;
2077    bd->bd_target = target;
2078    bd->bd_lun = 0;
2079    snprintf(bd->bd_procdev, sizeof(bd->bd_procdev), "Namespace %u", target);
2080
2081    bs = &bd->bd_bio.bio_status;
2082    bs->bs_status = BIO_STATUS_SUCCESS;
2083    snprintf(bs->bs_controller, sizeof(bs->bs_controller), "%11u",
2084        bd->bd_diskid);
2085
2086    idns = sc->sc_namespaces[target].ident;
2087    if (idns == NULL) {
2088        bd->bd_status = BIOC_SDUNUSED;
2089        return 0;
2090    }
2091
2092    lbaf = NVME_ID_NS_FLBAS(idns->flbas);
2093    if (idns->nlbaf > nitems(idns->lbaf))
2094        lbaf |= (idns->flbas >> 1) & 0x3f;
2095    bd->bd_size = lemtoh64(&idns->nsze) << idns->lbaf[lbaf].lbads;
2096
2097    if (memcmp(idns->nguid, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0", 16)) {
2098        memcpy(&id1, idns->nguid, sizeof(uint64_t));
2099        memcpy(&id2, idns->nguid + sizeof(uint64_t), sizeof(uint64_t));
2100        snprintf(bd->bd_serial, sizeof(bd->bd_serial), "%08llx%08llx",
2101            id1, id2);
2102    } else if (memcmp(idns->eui64, "\0\0\0\0\0\0\0\0", 8)) {
2103        memcpy(&id1, idns->eui64, sizeof(uint64_t));
2104        snprintf(bd->bd_serial, sizeof(bd->bd_serial), "%08llx", id1);
2105    }
2106
2107    msg[0] = '\0';
2108    for (i = 0; i <= idns->nlbaf; i++) {
2109        if (idns->lbaf[i].lbads == 0)
2110            continue;
2111        snprintf(buf, sizeof(buf), "%s%s%u",
2112            strlen(msg) ? ", " : "", (i == lbaf) ? "*" : "",
2113            1 << idns->lbaf[i].lbads);
2114        strlcat(msg, buf, sizeof(msg));
2115        ms = lemtoh16(&idns->lbaf[i].ms);
2116        if (ms) {
2117            snprintf(buf, sizeof(buf), "+%u", ms);
2118            strlcat(msg, buf, sizeof(msg));
2119        }
2120        strlcat(msg, rpdesc[idns->lbaf[i].rp], sizeof(msg));
2121    }
2122    nvme_bio_status(bs, "Formats %s", msg);
2123
2124    if (idns->nsfeat)
2125        nvme_bio_status(bs, "Features 0x%b", idns->nsfeat,
2126            NVME_ID_NS_NSFEAT_FMT);
2127
2128    if (idns->dps) {
2129        dps = idns->dps;
2130        snprintf(msg, sizeof(msg), "Data Protection (0x%02x) "
2131            "Protection Data in ", dps);
2132        if (ISSET(dps, NVME_ID_NS_DPS_PIP))
2133            strlcat(msg, "first", sizeof(msg));
2134        else
2135            strlcat(msg, "last", sizeof(msg));
2136        strlcat(msg, "bytes of metadata, Protection ", sizeof(msg));
2137        if (NVME_ID_NS_DPS_TYPE(dps) >= nitems(protection))
2138            strlcat(msg, "Type unknown", sizeof(msg));
2139        else
2140            strlcat(msg, protection[NVME_ID_NS_DPS_TYPE(dps)],
2141                sizeof(msg));
2142        nvme_bio_status(bs, "%s", msg);
2143    }
2144
2145    if (nvme_bioctl_sdname(sc, target) == NULL)
2146        bd->bd_status = BIOC_SDOFFLINE;
2147    else
2148        bd->bd_status = BIOC_SDONLINE;
2149
2150    return 0;
2151}
2152#endif  /* NBIO > 0 */
2153
2154#ifndef SMALL_KERNEL
/*
 * Sensor refresh callback: issue a polled Get Log Page (SMART / Health
 * Information, over all namespaces) admin command and update the
 * temperature, spare-capacity and percent-used sensors.  On any failure
 * all three sensors are marked unknown.
 */
void
nvme_refresh_sensors(void *arg)
{
    struct nvme_softc       *sc = arg;
    struct nvme_sqe          sqe;
    struct nvme_dmamem      *mem = NULL;
    struct nvme_ccb         *ccb = NULL;
    struct nvm_smart_health     *health;
    uint32_t             dwlen;
    uint8_t              cw;
    int              flags;
    int64_t              temp;

    ccb = nvme_ccb_get(sc);
    if (ccb == NULL)
        goto failed;

    /* DMA buffer the controller writes the log page into. */
    mem = nvme_dmamem_alloc(sc, sizeof(*health));
    if (mem == NULL)
        goto failed;
    nvme_dmamem_sync(sc, mem, BUS_DMASYNC_PREREAD);

    /* NUMD is expressed in dwords, 0-based; hence (len/4) - 1. */
    dwlen = (sizeof(*health) >> 2) - 1;
    memset(&sqe, 0, sizeof(sqe));
    sqe.opcode = NVM_ADMIN_GET_LOG_PG;
    /* NSID 0xffffffff: log page scope is the whole controller. */
    htolem32(&sqe.nsid, 0xffffffff);
    htolem32(&sqe.cdw10, (dwlen << 16 | NVM_LOG_PAGE_SMART_HEALTH));
    htolem64(&sqe.entry.prp[0], NVME_DMA_DVA(mem));

    ccb->ccb_done = nvme_empty_done;
    ccb->ccb_cookie = &sqe;
    /* Poll the admin queue; non-zero flags means the command failed. */
    flags = nvme_poll(sc, sc->sc_admin_q, ccb, nvme_sqe_fill, NVME_TIMO_LOG_PAGE);

    nvme_dmamem_sync(sc, mem, BUS_DMASYNC_POSTREAD);

    if (flags != 0)
        goto failed;

    health = NVME_DMA_KVA(mem);
    cw = health->critical_warning;

    /* Critical-warning bits drive per-sensor CRIT status. */
    sc->sc_temp_sensor.status = (cw & NVM_HEALTH_CW_TEMP) ?
        SENSOR_S_CRIT : SENSOR_S_OK;
    /*
     * Log page reports temperature in Kelvin; the sensor framework
     * wants micro-Kelvin.  NOTE(review): the extra +150000 looks like a
     * half-unit rounding bias in the 0.1K-granular display — confirm.
     */
    temp = letoh16(health->temperature);
    sc->sc_temp_sensor.value = (temp * 1000000) + 150000;

    sc->sc_spare_sensor.status = (cw & NVM_HEALTH_CW_SPARE) ?
        SENSOR_S_CRIT : SENSOR_S_OK;
    /* avail_spare/percent_used are percentages; sensors are per-mille. */
    sc->sc_spare_sensor.value = health->avail_spare * 1000;

    sc->sc_usage_sensor.status = SENSOR_S_OK;
    sc->sc_usage_sensor.value = health->percent_used * 1000;
    goto done;

 failed:
    sc->sc_temp_sensor.status = SENSOR_S_UNKNOWN;
    sc->sc_usage_sensor.status = SENSOR_S_UNKNOWN;
    sc->sc_spare_sensor.status = SENSOR_S_UNKNOWN;
 done:
    if (mem != NULL)
        nvme_dmamem_free(sc, mem);
    if (ccb != NULL)
        nvme_ccb_put(sc, ccb);
}
2219#endif /* SMALL_KERNEL */
2220