Re: [Intel-IOMMU 06/10] Intel IOMMU driver

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Wed, 06 Jun 2007 11:57:04 -0700
[email protected] wrote:

> 	Actual intel IOMMU driver. Hardware spec can be found at:
> http://www.intel.com/technology/virtualization
> 
> This driver sets X86_64 'dma_ops', so hook into standard DMA APIs. In this way,
> PCI driver will get virtual DMA address. This change is transparent to PCI
> drivers.
> 
> ...
>  
> +#ifdef CONFIG_DMAR
> +	detect_intel_iommu();
> +#endif
> +
>  #ifdef CONFIG_SWIOTLB
>  	pci_swiotlb_init();
>  #endif
> @@ -314,6 +319,10 @@
>  	calgary_iommu_init();
>  #endif
>  
> +#ifdef CONFIG_DMAR
> +	intel_iommu_init();
> +#endif

We'd prefer that the header file have suitable #ifndef CONFIG_DMAR stubs,
so the ifdefs here become unneeded.

> +/* context entry handling */
> +static struct context_entry * device_to_context_entry(struct intel_iommu *iommu,
> +		u8 bus, u8 devfn)
> +{
> +	struct root_entry *root;
> +	struct context_entry *context;
> +	unsigned long phy_addr;
> +	unsigned long flags;
> +
> +	spin_lock_irqsave(&iommu->lock, flags);
> +	root = &iommu->root_entry[bus];
> +	if (!(context = get_context_addr_from_root(*root))) {
> +		context = (struct context_entry *)alloc_pgtable_page();
> +		if (!context) {
> +			spin_unlock_irqrestore(&iommu->lock, flags);
> +			return NULL;
> +		}
> +		__iommu_flush_cache(iommu, (void *)context, PAGE_SIZE_4K);
> +		phy_addr = virt_to_phys((void *)context);
> +		set_root_value(*root, phy_addr);
> +		set_root_present(*root);
> +		__iommu_flush_cache(iommu, root, sizeof(*root));
> +	}
> +	spin_unlock_irqrestore(&iommu->lock, flags);
> +	return &context[devfn];
> +}

checkpatch.pl has lots of fun with this patch.

> +/* page table handling */
> +#define LEVEL_STRIDE		(9)
> +#define LEVEL_MASK		(((u64)1 << LEVEL_STRIDE) - 1)
> +#define agaw_to_level(val) ((val) + 2)
> +#define agaw_to_width(val) (30 + val * LEVEL_STRIDE)
> +#define width_to_agaw(w)  ((w - 30)/LEVEL_STRIDE)
> +#define level_to_offset_bits(l) (12 + (l - 1) * LEVEL_STRIDE)
> +#define address_level_offset(addr, level) \
> +	((addr >> level_to_offset_bits(level)) & LEVEL_MASK)
> +#define level_mask(l) (((u64)(-1)) << level_to_offset_bits(l))
> +#define level_size(l) ((u64)1 << level_to_offset_bits(l))
> +#define align_to_level(addr, l) ((addr + level_size(l) - 1) & level_mask(l))

static inlines are better than macros - please consider.

If you're going to stick with macros here then you'll find that many of the
above macro's arguments are insufficiently parenthesised.

> +#define IOMMU_WAIT_OP(iommu, offset, op, cond, sts) \
> +{\
> +	unsigned long start_time = jiffies;\
> +	while (1) {\
> +		sts = op (iommu->reg, offset);\
> +		if (cond)\
> +			break;\
> +		if (time_after(jiffies, start_time + DMAR_OPERATION_TIMEOUT))\
> +			panic("DMAR hardware is malfunctional, please disable IOMMU\n");\
> +		cpu_relax();\
> +	}\
> +}

wow, harsh treatment.

"malfunctioning" might parse better here.

> +static int inline get_alignment(u64 base, unsigned int size)
> +{
> +	int t = 0;
> +	u64 end;
> +
> +	end = base + size - 1;
> +	while (base != end) {
> +		t++;
> +		base >>= 1;
> +		end >>= 1;
> +	}
> +	return t;
> +}

What's this (too large to inline) function doing?  I suspect we might have
helper functions which already do it...  If not, perhasp we should.


> +static int inline iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
> +	u64 addr, unsigned int pages, int non_present_entry_flush)
> +{
> +	unsigned int align;
> +
> +	BUG_ON(addr & (~PAGE_MASK_4K));
> +	BUG_ON(pages == 0);
> +
> +	/* Fallback to domain selective flush if no PSI support */
> +	if (!cap_pgsel_inv(iommu->cap))
> +		return iommu_flush_iotlb_dsi(iommu, did,
> +			non_present_entry_flush);
> +
> +	/*
> +	 * PSI requires page size is 2 ^ x, and the base address is naturally
> +	 * aligned to the size
> +	 */
> +	align = get_alignment(addr >> PAGE_SHIFT_4K, pages);
> +	/* Fallback to domain selective flush if size is too big */
> +	if (align > cap_max_amask_val(iommu->cap))
> +		return iommu_flush_iotlb_dsi(iommu, did,
> +			non_present_entry_flush);
> +
> +	addr >>= PAGE_SHIFT_4K + align;
> +	addr <<= PAGE_SHIFT_4K + align;
> +
> +	return __iommu_flush_iotlb(iommu, did, addr, align,
> +		DMA_TLB_PSI_FLUSH, non_present_entry_flush);
> +}

too large for inlining.

> +static int iommu_enable_translation(struct intel_iommu *iommu)
> +{
> +	u32 sts;
> +	unsigned long flag;

we conventionally use "flags" for this.

> +	spin_lock_irqsave(&iommu->register_lock, flag);
> +	dmar_writel(iommu->reg, DMAR_GCMD_REG, iommu->gcmd|DMA_GCMD_TE);
> +
> +	/* Make sure hardware complete it */
> +	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, dmar_readl, (sts & DMA_GSTS_TES), sts);
> +
> +	iommu->gcmd |= DMA_GCMD_TE;
> +	spin_unlock_irqrestore(&iommu->register_lock, flag);
> +	return 0;
> +}
>
> ...
>
>
> +#define aligned_size(host_addr, size) \
> +	PAGE_ALIGN_4K((host_addr & (~PAGE_MASK_4K)) + size)

insufficiently parenthesized.  Consider usign a static inline.

> +struct dma_mapping_ops intel_dma_ops = {
> +	.alloc_coherent = intel_alloc_coherent,
> +	.free_coherent = intel_free_coherent,
> +	.map_single = intel_map_single,
> +	.unmap_single = intel_unmap_single,
> +	.map_sg = intel_map_sg,
> +	.unmap_sg = intel_unmap_sg,
> +};

can it be static?

> +void *iommu_rpool_alloc(unsigned int size, gfp_t flag)
> +{
> +	if (size == PAGE_SIZE_4K)
> +		return(void *)get_zeroed_page(flag);
> +	else
> +		return kzalloc(size, flag);
> +}

kmalloc(4k) is pretty efficient and (I think) is guaranteed to return a
page-aligned address.

iow: can we just use kmalloc here?

> +
> +static inline int
> +iommu_devinfo_pool_init(void)
> +{
> +	return init_resource_pool(&iommu_devinfo_pool, MIN_DEVINFO_REQ,
> +		sizeof(struct device_domain_info),
> +		GROW_DEVINFO_REQ, iommu_rpool_alloc,
> +		iommu_rpool_free);
> +}
> +
> +static inline int
> +iommu_iova_pool_init(void)
> +{
> +	return init_resource_pool(&iommu_iova_pool, MIN_IOVA_REQ,
> +		sizeof(struct iova),
> +		GROW_IOVA_REQ, iommu_rpool_alloc, iommu_rpool_free);
> +}
> +
> +static int iommu_init_mempool(void)
> +{
> +	int ret;
> +	ret = iommu_iova_pool_init();
> +	if (ret)
> +		return ret;
> +
> +	ret = iommu_pgtable_pool_init();
> +	if (ret)
> +		goto pgtable_error;
> +
> +	ret = iommu_domain_pool_init();
> +	if (ret)
> +		goto domain_error;
> +
> +	ret = iommu_devinfo_pool_init();
> +	if (!ret)
> +		return ret;
> +
> +	destroy_resource_pool(&iommu_domain_pool);
> +domain_error:
> +	destroy_resource_pool(&iommu_pgtable_pool);
> +pgtable_error:
> +	destroy_resource_pool(&iommu_iova_pool);
> +
> +	return -ENOMEM;
> +}

can be __init

> +static void iommu_exit_mempool(void)
> +{
> +	destroy_resource_pool(&iommu_devinfo_pool);
> +	destroy_resource_pool(&iommu_domain_pool);
> +	destroy_resource_pool(&iommu_pgtable_pool);
> +	destroy_resource_pool(&iommu_iova_pool);
> +}

ditto (unexpectedly)

> +void __init detect_intel_iommu(void)
> +{
> +	if (swiotlb || no_iommu || iommu_detected || dmar_disabled)
> +		return;
> +	if (early_dmar_detect()) {
> +		iommu_detected = 1;
> +	}
> +}
> +
> +static void __init init_no_remapping_devices(void)
> +{
> +	struct dmar_drhd_unit *drhd;
> +
> +	for_each_drhd_unit(drhd)
> +		if (!drhd->include_all) {
> +			int i;
> +			for (i=0; i < drhd->devices_cnt; i++)
> +				if (drhd->devices[i] != NULL)
> +					break;
> +			/* ignore DMAR unit if no pci devices exist */
> +			if (i == drhd->devices_cnt)
> +				drhd->ignored = 1;
> +		}

looks weird - I'd add the extra braces here.

> +	if (dmar_map_gfx)
> +		return;
> +
> +	for_each_drhd_unit(drhd) {
> +		int i;
> +		if (drhd->ignored || drhd->include_all)
> +			continue;
> +
> +		for (i = 0; i < drhd->devices_cnt; i++)
> +			if (drhd->devices[i] && !IS_GFX_DEVICE(drhd->devices[i]))
> +				break;
> +
> +		if (i < drhd->devices_cnt)
> +			continue;
> +
> +		/* bypass IOMMU if it is just for gfx devices */
> +		drhd->ignored = 1;
> +		for (i = 0; i < drhd->devices_cnt; i++) {
> +			if (!drhd->devices[i])
> +				continue;
> +			drhd->devices[i]->sysdata = DUMMY_DEVICE_DOMAIN_INFO;
> +		}
> +	}
> +}
> +
>
> ...
>
> +#define OFFSET_STRIDE		(9)
> +#define dmar_readl(dmar, reg) readl(dmar + reg)
> +#define dmar_writel(dmar, reg, val) writel((val), dmar + reg)

Is this pointless obfuscation?

> +#define dmar_readq(dmar, reg) ({ \
> +		u32 lo, hi; \
> +		lo = dmar_readl(dmar, reg); \
> +		hi = dmar_readl(dmar, reg + 4); \
> +		(((u64) hi) << 32) + lo; })
> +#define dmar_writeq(dmar, reg, val) do {\
> +		dmar_writel(dmar, reg, (u32)(val)); \
> +		dmar_writel(dmar, reg + 4, (u32)((val) >> 32)); \
> +	} while (0)

Do these need to be macros?  If not, a regular C function would be better. 
Not necessarily an inlined one, either.

> +#define VER_MAJOR(v)		(((v) & 0xf0) >> 4)
> +#define VER_MINOR(v)		((v) & 0x0f)

We already have several VER_MAJORs defined in the tree, so adding a new one
is asking for trouble.  Suggest the use of a more specific identifier.

> +#define set_root_value(root, value) \
> +	do {(root).val |= ((value) & PAGE_MASK_4K);} while(0)

Methinks that could be written in C too.

> +struct domain {

hm, "domain" is a pretty generic term.  I think there's a bit of namespace
hogging here..

> +	int	id;			/* domain id */
> +	struct intel_iommu *iommu;	/* back pointer to owning iommu */
> +
> +	struct list_head devices; 	/* all devices' list */
> +	struct iova_domain iovad;	/* iova's that belong to this domain */
> +
> +	struct dma_pte	*pgd;		/* virtual address */
> +	spinlock_t	mapping_lock;	/* page table lock */
> +	int		gaw;		/* max guest address width */
> +	int		agaw;		/* adjusted guest address width, 0 is level 2 30-bit */
> +
> +#define DOMAIN_FLAG_MULTIPLE_DEVICES 1
> +	int		flags;
> +};
> +
>
> ...
>
> +#define for_each_drhd_unit(drhd) \
> +	list_for_each_entry(drhd, &dmar_drhd_units, list)
> +#define for_each_rmrr_units(rmrr) \
> +	list_for_each_entry(rmrr, &dmar_rmrr_units, list)
> +#define begin_for_each_rmrr_device(rmrr, pdev) \
> +	for_each_rmrr_units(rmrr) { \
> +		int _i; \
> +		for (_i = 0; _i < rmrr->devices_cnt; _i++) { \
> +			pdev = rmrr->devices[_i]; \
> +			/* some BIOS lists non-exist devices in DMAR table */\
> +			if (!pdev) \
> +				continue;
> +#define end_for_each_rmrr_device(rmrr, pdev) \
> +		} \
> +	}
> +

Are these used often enough to justify their inclusion?

Would it be possible to implement these as regular C functions which are
passed the address of a callback function?

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [email protected]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[Index of Archives]     [Kernel Newbies]     [Netfilter]     [Bugtraq]     [Photo]     [Stuff]     [Gimp]     [Yosemite News]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux RAID]     [Video 4 Linux]     [Linux for the blind]     [Linux Resources]
  Powered by Linux