Re: [PATCH 3/6] i386 virtualization - Make ldt a desc struct

From: Chris Wright (chrisw_at_osdl.org)
Date: 08/16/05

  • Next message: Michael E Brown: "Re: [RFC][PATCH 2.6.13-rc6] add Dell Systems Management Base Driver (dcdbas) with sysfs support"
    Date:	Mon, 15 Aug 2005 22:23:52 -0700
    To: zach@vmware.com
    
    

    * zach@vmware.com (zach@vmware.com) wrote:
    > Make the LDT a desc_struct pointer, since this is what it actually is.

    I like that plan.

    > There is code which relies on the fact that LDTs are allocated in page
    > chunks, and it is both cleaner and more convenient to keep the rather
    > poorly named "size" variable from the LDT in terms of LDT pages.

    I noticed it's replaced by context.ldt and context.ldt_pages, which
    appear to be decoupling the overloaded use from before. Looks sane.
    More comments below.

    > Signed-off-by: Zachary Amsden <zach@vmware.com>
    > Index: linux-2.6.13/include/asm-i386/mmu.h
    > ===================================================================
    > --- linux-2.6.13.orig/include/asm-i386/mmu.h 2005-08-15 11:16:59.000000000 -0700
    > +++ linux-2.6.13/include/asm-i386/mmu.h 2005-08-15 11:19:49.000000000 -0700
    > @@ -9,9 +9,9 @@
    > * cpu_vm_mask is used to optimize ldt flushing.
    > */
    > typedef struct {
    > - int size;
    > struct semaphore sem;
    > - void *ldt;
    > + struct desc_struct *ldt;
    > + int ldt_pages;
    > } mm_context_t;
    >
    > #endif
    > Index: linux-2.6.13/include/asm-i386/desc.h
    > ===================================================================
    > --- linux-2.6.13.orig/include/asm-i386/desc.h 2005-08-15 11:16:59.000000000 -0700
    > +++ linux-2.6.13/include/asm-i386/desc.h 2005-08-15 11:19:49.000000000 -0700
    > @@ -6,6 +6,9 @@
    >
    > #define CPU_16BIT_STACK_SIZE 1024
    >
    > +/* The number of LDT entries per page */
    > +#define LDT_ENTRIES_PER_PAGE (PAGE_SIZE / LDT_ENTRY_SIZE)
    > +
    > #ifndef __ASSEMBLY__
    >
    > #include <linux/preempt.h>
    > @@ -30,7 +33,7 @@
    > static inline unsigned long get_desc_base(struct desc_struct *desc)
    > {
    > unsigned long base;
    > - base = ((desc->a >> 16) & 0x0000ffff) |
    > + base = (desc->a >> 16) |

    Seemingly unrelated.

    > ((desc->b << 16) & 0x00ff0000) |
    > (desc->b & 0xff000000);
    > return base;
    > @@ -171,7 +174,7 @@
    > static inline void load_LDT_nolock(mm_context_t *pc, int cpu)
    > {
    > void *segments = pc->ldt;
    > - int count = pc->size;
    > + int count = pc->ldt_pages * LDT_ENTRIES_PER_PAGE;
    >
    > if (likely(!count)) {
    > segments = &default_ldt[0];
    > Index: linux-2.6.13/include/asm-i386/mmu_context.h
    > ===================================================================
    > --- linux-2.6.13.orig/include/asm-i386/mmu_context.h 2005-08-15 11:16:59.000000000 -0700
    > +++ linux-2.6.13/include/asm-i386/mmu_context.h 2005-08-15 11:19:49.000000000 -0700
    > @@ -19,7 +19,7 @@
    > memset(&mm->context, 0, sizeof(mm->context));
    > init_MUTEX(&mm->context.sem);
    > old_mm = current->mm;
    > - if (old_mm && unlikely(old_mm->context.size > 0)) {
    > + if (old_mm && unlikely(old_mm->context.ldt)) {
    > retval = copy_ldt(&mm->context, &old_mm->context);
    > }
    > if (retval == 0)
    > @@ -32,7 +32,7 @@
    > */
    > static inline void destroy_context(struct mm_struct *mm)
    > {
    > - if (unlikely(mm->context.size))
    > + if (unlikely(mm->context.ldt))
    > destroy_ldt(mm);
    > del_lazy_mm(mm);
    > }
    > Index: linux-2.6.13/include/asm-i386/mach-default/mach_desc.h
    > ===================================================================
    > --- linux-2.6.13.orig/include/asm-i386/mach-default/mach_desc.h 2005-08-15 11:16:59.000000000 -0700
    > +++ linux-2.6.13/include/asm-i386/mach-default/mach_desc.h 2005-08-15 11:19:49.000000000 -0700
    > @@ -62,11 +62,10 @@
    > _set_tssldt_desc(&get_cpu_gdt_table(cpu)[GDT_ENTRY_LDT], (int)addr, ((size << 3)-1), 0x82);
    > }
    >
    > -static inline int write_ldt_entry(void *ldt, int entry, __u32 entry_a, __u32 entry_b)
    > +static inline int write_ldt_entry(struct desc_struct *ldt, int entry, __u32 entry_a, __u32 entry_b)
    > {
    > - __u32 *lp = (__u32 *)((char *)ldt + entry*8);
    > - *lp = entry_a;
    > - *(lp+1) = entry_b;
    > + ldt[entry].a = entry_a;
    > + ldt[entry].b = entry_b;
    > return 0;
    > }
    >
    > Index: linux-2.6.13/arch/i386/kernel/ptrace.c
    > ===================================================================
    > --- linux-2.6.13.orig/arch/i386/kernel/ptrace.c 2005-08-15 11:16:59.000000000 -0700
    > +++ linux-2.6.13/arch/i386/kernel/ptrace.c 2005-08-15 11:19:49.000000000 -0700
    > @@ -164,18 +164,20 @@
    > * and APM bios ones we just ignore here.
    > */
    > if (segment_from_ldt(seg)) {
    > - u32 *desc;
    > + mm_context_t *context;
    > + struct desc_struct *desc;
    > unsigned long base;
    >
    > - down(&child->mm->context.sem);
    > - desc = child->mm->context.ldt + (seg & ~7);
    > - base = (desc[0] >> 16) | ((desc[1] & 0xff) << 16) | (desc[1] & 0xff000000);
    > + context = &child->mm->context;
    > + down(&context->sem);
    > + desc = &context->ldt[segment_index(seg)];
    > + base = get_desc_base(desc);
    >
    > /* 16-bit code segment? */
    > - if (!((desc[1] >> 22) & 1))
    > + if (!get_desc_32bit(desc))
    > addr &= 0xffff;
    > addr += base;
    > - up(&child->mm->context.sem);
    > + up(&context->sem);
    > }
    > return addr;
    > }
    > Index: linux-2.6.13/arch/i386/kernel/ldt.c
    > ===================================================================
    > --- linux-2.6.13.orig/arch/i386/kernel/ldt.c 2005-08-15 11:16:59.000000000 -0700
    > +++ linux-2.6.13/arch/i386/kernel/ldt.c 2005-08-15 11:19:49.000000000 -0700
    > @@ -28,28 +28,27 @@
    > }
    > #endif
    >
    > -static inline int alloc_ldt(mm_context_t *pc, const int oldsize, int mincount, const int reload)
    > +static inline int alloc_ldt(mm_context_t *pc, const int old_pages, int new_pages, const int reload)
    > {
    > - void *oldldt;
    > - void *newldt;
    > + struct desc_struct *oldldt;
    > + struct desc_struct *newldt;
    >

    Not quite related here (since change was introduced in earlier patch),
    but old alloc_ldt special cased when room was available. This is gone,
    so am I reading this correctly, each time through it will allocate a
    new one, and free the old one (if it existed)? Just double checking
    that change doesn't introduce possible mem leak.

    > - mincount = (mincount+511)&(~511);
    > - if (mincount*LDT_ENTRY_SIZE > PAGE_SIZE)
    > - newldt = vmalloc(mincount*LDT_ENTRY_SIZE);
    > + if (new_pages > 1)
    > + newldt = vmalloc(new_pages*PAGE_SIZE);
    > else
    > - newldt = kmalloc(mincount*LDT_ENTRY_SIZE, GFP_KERNEL);
    > + newldt = kmalloc(PAGE_SIZE, GFP_KERNEL);

    If so, then full page is likely to be reusable in common case, no? (If
    there's such a thing as LDT common case ;-)

    > if (!newldt)
    > return -ENOMEM;
    >
    > - if (oldsize)
    > - memcpy(newldt, pc->ldt, oldsize*LDT_ENTRY_SIZE);
    > + if (old_pages)
    > + memcpy(newldt, pc->ldt, old_pages*PAGE_SIZE);
    > oldldt = pc->ldt;
    > if (reload)
    > - memset(newldt+oldsize*LDT_ENTRY_SIZE, 0, (mincount-oldsize)*LDT_ENTRY_SIZE);
    > + memset(newldt+old_pages*LDT_ENTRIES_PER_PAGE, 0, (new_pages-old_pages)*PAGE_SIZE);

    In fact, I _think_ this causes a problem. Who says newldt is bigger
    than old one? This looks like user-triggerable oops to me.

    > pc->ldt = newldt;
    > wmb();
    > - pc->size = mincount;
    > + pc->ldt_pages = new_pages;
    > wmb();
    >
    > /*
    > @@ -60,20 +59,20 @@
    > #ifdef CONFIG_SMP
    > cpumask_t mask;
    > preempt_disable();
    > - SetPagesLDT(pc->ldt, (pc->size * LDT_ENTRY_SIZE) / PAGE_SIZE);
    > + SetPagesLDT(pc->ldt, pc->ldt_pages);
    > load_LDT(pc);
    > mask = cpumask_of_cpu(smp_processor_id());
    > if (!cpus_equal(current->mm->cpu_vm_mask, mask))
    > smp_call_function(flush_ldt, NULL, 1, 1);
    > preempt_enable();
    > #else
    > - SetPagesLDT(pc->ldt, (pc->size * LDT_ENTRY_SIZE) / PAGE_SIZE);
    > + SetPagesLDT(pc->ldt, pc->ldt_pages);
    > load_LDT(pc);
    > #endif
    > }
    > - if (oldsize) {
    > - ClearPagesLDT(oldldt, (oldsize * LDT_ENTRY_SIZE) / PAGE_SIZE);
    > - if (oldsize*LDT_ENTRY_SIZE > PAGE_SIZE)
    > + if (old_pages) {
    > + ClearPagesLDT(oldldt, old_pages);
    > + if (old_pages > 1)
    > vfree(oldldt);
    > else
    > kfree(oldldt);
    > @@ -86,10 +85,10 @@
    > int err;
    >
    > down(&old->sem);
    > - err = alloc_ldt(new, 0, old->size, 0);
    > + err = alloc_ldt(new, 0, old->ldt_pages, 0);
    > if (!err) {
    > - memcpy(new->ldt, old->ldt, old->size*LDT_ENTRY_SIZE);
    > - SetPagesLDT(new->ldt, (new->size * LDT_ENTRY_SIZE) / PAGE_SIZE);
    > + memcpy(new->ldt, old->ldt, old->ldt_pages*PAGE_SIZE);
    > + SetPagesLDT(new->ldt, new->ldt_pages);
    > }
    > up(&old->sem);
    > return err;
    > @@ -97,14 +96,16 @@
    >
    > void destroy_ldt(struct mm_struct *mm)
    > {
    > + int pages = mm->context.ldt_pages;
    > +
    > if (mm == current->active_mm)
    > clear_LDT();
    > - ClearPagesLDT(mm->context.ldt, (mm->context.size * LDT_ENTRY_SIZE) / PAGE_SIZE);
    > - if (mm->context.size*LDT_ENTRY_SIZE > PAGE_SIZE)
    > + ClearPagesLDT(mm->context.ldt, pages);
    > + if (pages > 1)
    > vfree(mm->context.ldt);
    > else
    > kfree(mm->context.ldt);
    > - mm->context.size = 0;
    > + mm->context.ldt_pages = 0;
    > }
    >
    > static int read_ldt(void __user * ptr, unsigned long bytecount)
    > @@ -113,13 +114,13 @@
    > unsigned long size;
    > struct mm_struct * mm = current->mm;
    >
    > - if (!mm->context.size)
    > + if (!mm->context.ldt_pages)
    > return 0;
    > if (bytecount > LDT_ENTRY_SIZE*LDT_ENTRIES)
    > bytecount = LDT_ENTRY_SIZE*LDT_ENTRIES;
    >
    > down(&mm->context.sem);
    > - size = mm->context.size*LDT_ENTRY_SIZE;
    > + size = mm->context.ldt_pages*PAGE_SIZE;
    > if (size > bytecount)
    > size = bytecount;

    This now looks like you can leak data? Since full page is unlikely
    used, but accounting is done in page sizes. Asking to read_ldt with
    bytcount of PAGE_SIZE could give some uninitialzed data back to user.
    Did I miss the spot where this is always zero-filled?

    > @@ -166,6 +167,7 @@
    > __u32 entry_1, entry_2;
    > int error;
    > struct user_desc ldt_info;
    > + int page_number;
    >
    > error = -EINVAL;
    > if (bytecount != sizeof(ldt_info))
    > @@ -184,10 +186,11 @@
    > goto out;
    > }
    >
    > + page_number = ldt_info.entry_number / LDT_ENTRIES_PER_PAGE;
    > down(&mm->context.sem);
    > - if (ldt_info.entry_number >= mm->context.size) {
    > - error = alloc_ldt(&current->mm->context, mm->context.size,
    > - ldt_info.entry_number+1, 1);
    > + if (page_number >= mm->context.ldt_pages) {
    > + error = alloc_ldt(&current->mm->context, mm->context.ldt_pages,
    > + page_number+1, 1);
    > if (error < 0)
    > goto out_unlock;
    > }
    > Index: linux-2.6.13/arch/i386/kernel/kprobes.c
    > ===================================================================
    > --- linux-2.6.13.orig/arch/i386/kernel/kprobes.c 2005-08-15 11:16:59.000000000 -0700
    > +++ linux-2.6.13/arch/i386/kernel/kprobes.c 2005-08-15 11:19:49.000000000 -0700
    > @@ -164,8 +164,7 @@
    > */
    > if (segment_from_ldt(regs->xcs) && (current->mm)) {
    > struct desc_struct *desc;
    > - desc = (struct desc_struct *) ((char *) current->mm->context.ldt +
    > - (segment_index(regs->xcs) * 8));
    > + desc = &current->mm->context.ldt[segment_index(regs->xcs)];
    > addr = (kprobe_opcode_t *) (get_desc_base(desc) + regs->eip -
    > sizeof(kprobe_opcode_t));
    > } else {
    >
    -
    To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
    the body of a message to majordomo@vger.kernel.org
    More majordomo info at http://vger.kernel.org/majordomo-info.html
    Please read the FAQ at http://www.tux.org/lkml/


  • Next message: Michael E Brown: "Re: [RFC][PATCH 2.6.13-rc6] add Dell Systems Management Base Driver (dcdbas) with sysfs support"

    Relevant Pages

    • Re: IA32 | [OFFTOPIC]: Segementation Question
      ... > I'm currently doing some research on the IA32 Segementation Concept. ... To the GDT or the current LDT? ... send the line "unsubscribe linux-kernel" in ...
      (Linux-Kernel)
    • Re: 2.4, 2.6, i686/athlon and LDTs
      ... > descriptor regardless of if it came from the GDT or LDT. ... do't confuse people with "does anyone have comparison information ... send the line "unsubscribe linux-kernel" in ...
      (Linux-Kernel)
    • Re: [PATCH 0/6] i386 virtualization patches, Set 3
      ... > This round attempts to conclude all of the LDT related cleanup with some ... > really rather nasty kprobes problems, and the basic framework for an LDT ... > nice new segment and descriptor table accessors, ... send the line "unsubscribe linux-kernel" in ...
      (Linux-Kernel)
    • Re: [uml-devel] [PATCH 8/10] UML - Maintain own LDT entries
      ... > Patch imlements full LDT handling in SKAS: ... *) It reverts my cleanup and consolidation of ldt.c wrt. SKAS vs TT. ... send the line "unsubscribe linux-kernel" in ...
      (Linux-Kernel)