/* 
 * Prospect: a developer's system profiler.
 *
 * COPYRIGHT (C) 2001-2004 Hewlett-Packard Company
 *
 * Author: Alex Tsariounov, HP
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License as published by the Free
 * Software Foundation; either version 2 of the License, or (at your option)
 * any later version.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 *
 * You should have received a copy of the GNU General Public License along with
 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
 * Place - Suite 330, Boston, MA 02111-1307, USA.
 */

/* $Id: linux_module.c,v 1.56 2004/01/13 00:37:16 type2 Exp $ */

/*
 *******************************************************************************
 *
 *                            PROSPECT PROJECT
 *           Linux Loadable Module Based Trace Processing Module
 *
 *     Note: some definitions lifted from oprofile code base, which is 
 *     Copyright 2001-2003 Oprofile authors.  See http://oprofile.sf.net.
 *
 *******************************************************************************
 */

/*
 * Revision identification string for this file: an "@(#)" prefix followed
 * by the expanded RCS Id keyword.  Compiled out when __LINT__ is defined.
 */
#ifndef __LINT__
static const char gRCSid[] = "@(#) $Id: linux_module.c,v 1.56 2004/01/13 00:37:16 type2 Exp $";
#endif

/*
 * System Header Files
 */
#include <unistd.h>
#include <stdlib.h>
#include <string.h>
#include <signal.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <sys/time.h>

#ifdef __ia64__
/*
 * Define these here while waiting for libc6.1-dev fixes.
 *
 * Short aliases (s8 ... u64) for the double-underscore fixed-width types,
 * provided on ia64 builds only.
 * NOTE(review): the __sN/__uN types are not declared in this file; they
 * presumably arrive through one of the system headers above -- confirm.
 */
typedef __s8 s8;
typedef __u8 u8;
typedef __s16 s16;
typedef __u16 u16;
typedef __s32 s32;
typedef __u32 u32;
typedef __s64 s64;
typedef __u64 u64;
#endif

/*
 * Prospect Header Files
 */
#include "prospect.h"
#include "linux_module.h"
#include "linux_model.h"
#include "rec_proc.h"
#include "version.h"
#include "bfile.h"
#include "ascii_report.h"

/*
 * Exported global data.
 *
 * gOp is the oprofile control state shared by the routines in this file.
 * The code below uses it for the device descriptors (hashfh, notefh, devfh),
 * the hash map mapping, the sample and note buffers with their sizes, the
 * counter settings (p0, p1), the sampling frequency (samplehz), the auto
 * flush rate and the read error flag.  The op_ctl_t type itself is declared
 * outside this file.
 */
op_ctl_t gOp;

/*
 * ----------------- Static functions in this file ----------------
 */
/* Called by op_init() with the module path; a non-zero return is treated
 * as a module load failure. */
static int run_insmod(char *modname);
/* Fills buf from the given oprofile device descriptor, or from the trace
 * file when trace input is active. */
static ssize_t op_read_device(int fh, void *buf, size_t size);
/* Accounts the sample at index idx of the current sample buffer. */
static void op_process_sample(ssize_t idx);
/* Notification handlers; op_read_buffer() dispatches to them on the note
 * type (OP_FORK, OP_EXEC, OP_EXIT, OP_MAP, OP_DROP_MODULES).  An OP_EXEC
 * note is passed to op_exec() and then also to op_map(). */
static void op_fork(ssize_t idx);
static void op_exec(pid_t pid);
static void op_exit(ssize_t idx);
static void op_map(ssize_t idx);
static void op_drop_modules(ssize_t idx);

/*
 * ----------------- Oprofile "static" definitions ---------------------
 *
 * These definitions describe the data exchanged with the oprofile module,
 * so they have to stay in step with the oprofile version in use.
 */

/* The hash map for file names layout has an array of index structures
 * followed by a string pool.  String pool length will have to track
 * oprofile's op_user.h.
 */
/* Number of index structures at the front of the hash map. */
#define OP_HASH_LINE_NR 4093
/* Size in bytes of the string pool that follows the index structures. */
#define POOL_SIZE 65536
/* Pointer to the string at byte offset idx in the pool.  The pool start is
 * found by stepping OP_HASH_LINE_NR elements past map, so the result depends
 * on map's pointee type -- presumably struct op_hash_index; confirm against
 * the declaration of the hash map pointer.
 */
#define GET_STRING(map, idx) ((char *)((map) + OP_HASH_LINE_NR) + (idx))

/* One index structure of the hash map.
 * NOTE(review): presumably name is a pool offset usable with GET_STRING and
 * parent refers to another index entry -- confirm against op_user.h.
 */
struct op_hash_index {
    unsigned int name;
    unsigned int parent;
} __attribute__((__packed__));

/* hash map size, but could be different */
#define OP_HASH_MAP_SIZE (OP_HASH_LINE_NR * sizeof(struct op_hash_index) +\
    POOL_SIZE)

/* sample record */
struct op_sample {
    unsigned long eip;      /* sampled address; op_process_sample() uses it as the pc */
    unsigned int counter;   /* counter value reported with the sample */
    unsigned int pid;       /* pid used to look up the owning process */
    unsigned int tgid;      /* not referenced in the visible part of this file */
};

/* sample buffer header */
struct op_buffer_head {
    int cpu_nr;                     /* passed as the cpu argument to the profile_*() calls */
    size_t count;                   /* number of buffer[] entries op_read_buffer() walks */
    enum oprof_state state;         /* not referenced in the visible part of this file */
    struct op_sample buffer[0];     /* samples follow the header directly */
} __attribute__((__packed__));

/* notification record */
struct op_note {
    unsigned long   addr;   /* op_fork() reads the new process id from here */
    unsigned long   len;
    unsigned long   offset;
    unsigned int    hash;
    unsigned int    pid;    /* process the note is about */
    unsigned int    tgid; 
    unsigned short  type;   /* one of the OP_* values below */
};

/* Definition and macros for notifications.  op_read_buffer() compares
 * op_note.type against these values.
 */
#define OP_FORK 1
#define OP_MAP 2
#define OP_EXEC 4
#define OP_DROP_MODULES 8
#define OP_EXIT 16

/* Counter bitfield in 0.2/0.3 */
#define OP_BITS 2
#define OP_BITS_COUNT (16-OP_BITS)
#define OP_BITS_OFF_MASK ~(3 << OP_BITS_COUNT)

/* -------------------  Exported Functions ---------------------- */

/*
 * int op_init(void)
 *
 * Initialize the oprofile sampling module.
 */
int 
op_init(void)
{
#define BUFSZ 256
    struct stat sbuf;
    char modnamestr[MAXPATHLEN], buf[BUFSZ];
    size_t ret;
    FILE *strm;
    unsigned int uval;
    int op_major, result, oprof_loaded, foundit;

    mINFORM("In op_init()");

    /*
     * First, check that module is loaded - load it if not.
     */ 
    oprof_loaded = FALSE;
    strm = fopen("/proc/devices", "r");
    while(fgets(buf, BUFSZ, strm)) {
        int len;
        if (strstr(buf, "oprof")) {
            /* find the correct device major to use */
            len = sscanf(buf, "%d", &op_major);
            if (len!=1) {
                ferr("couldn't scanf \"%s\" for op_major\n", buf);
                fclose(strm);
                return 1;
            }
            oprof_loaded = TRUE;
            break;
        }
    }
    fclose(strm);

    /* Attempt loading module if not found in /proc/devices */
    if (!oprof_loaded) {
        mINFORM(" driver \"oprof\" not in /proc/devices, trying load...");
        /* Check that the module is installed in standard place */
        sprintf(modnamestr,"/lib/modules/%s/%s/%s.o",
                gConf.my_utsname.release, op_moddir, op_modname);
        if (stat(modnamestr, &sbuf)) {
            mINFORM(" stat of %s failed.", modnamestr);
            ferr("couldn't find \"%s\"\n",  modnamestr);
            ferr_nt(
"\n"
" NOTE: Prospect depends on a functional oprofile installation.  You can get\n"
"       oprofile from http://oprofile.sf.net. However, make sure to read the\n"
"       prospect README.INSTALL file since you may already have an oprofile\n"
"       distribution that was delivered with prospect, but not installed.\n"
"       Check the /usr/share/doc/prospect[-" cREV "] directory.\n");
            prospect_exit(1);
            return 1;
        }

        /* If /proc/ksyms does not have sys_call_table exported, and the 
         * root check is on (i.e., a user-supplied system.map, or found in
         * /lib/modules/`uname -r`/build), then check that we are root.
         */
        if (!gOp.has_sys_call_table && gOp.ckroot) {
            if (!amroot()) {
                ferr("permission denied for user-supplied System.map\n");
                ferr_nt(
"\n"
" Please note:  You have supplied a System.map with -K, or I have found one\n"
"    from the build link in /lib/modules/.../build.  Since a kernel is\n"
"    running that does not export sys_call_table, your real user id is not\n"
"    root, and I have to load a module that counteracts this non-export, the\n"
"    situation is not allowed.  Either have the root user copy the correct\n"
"    System.map file into /boot, or run prospect as root.\n");
                prospect_exit(1);
            }
        }
        
        /* Check that sys_call_table and sys_mmap2 have values */
        if (!gOp.has_sys_call_table) {
            int err=0;
            if (gOp.addr_sys_call_table == 0)  {
                ferr("value of sys_call_table from System.map not found\n");
                err++;
            }
            if (gOp.addr_sys_mmap2 == 0 && err++) {
                ferr("value of sys_mmap2 from System.map was not found\n");
                err++;
            }
            if (err) prospect_exit(1);
        }

        /* Now, attempt to load module */
        if((ret=run_insmod(modnamestr))!= 0) {
            ferr("module load with /sbin/insmod problem\n");
            hint("Execute \"dmesg | tail\" for possible reason\n");
            hint("Try \"modprobe oprofile allow_unload=1\"\n");
            if (!gOp.has_sys_call_table)
               hint("Is the /boot/System.map-`uname -r` file correct?\n");
            hint("Unload rtc module if loaded and rtc mode in use\n");
            hint("RH 8.0 on P4: append 'nortc' to kernel command\n"
                 "      line in grub/lilo\n"
                );
            return 1;
        }
        /* check again that it's loaded */
        strm = fopen("/proc/modules", "r");
        if (!strm) {
            perr("Couldn't open /proc/modules");
            return 1;
        }
        foundit=FALSE;
        while (fgets(buf, BUFSZ, strm)) {
            if (strncmp(buf, op_modname, strlen(op_modname)) == 0) {
                foundit=TRUE;
                break;
            }
        }
        fclose(strm);
        if (!foundit) {
            ferr("can't find module %s in kernel after insmod\n", op_modname);
            return 1;
        }
        mINFORM(" module loaded");
        /* If we don't do this, then a first-time load will fail when opening
         * the hashmap.  Must be race between oprofile init and us.
         */
        strm = fopen("/proc/devices", "r");
        while(fgets(buf, BUFSZ, strm)) {
            int len;
            if (strstr(buf, "oprof")) {
                /* find the correct device major to use */
                len = sscanf(buf, "%d", &op_major);
                if (len!=1) {
                    ferr("couldn't scanf \"%s\" for op_major\n", buf);
                    fclose(strm);
                    return 1;
                }
                oprof_loaded = TRUE;
                break;
            }
        }
        fclose(strm);
    }
    else {
        mINFORM(" found \"oprof\" in /proc/devices");
    }

    /* Blow away and remake the device nodes to talk to oprofile. */
    mINFORM(" re-making oprofile device nodes...");
    result = mkdir(op_dir, 0);
    if (result && errno!=EEXIST) {
        mINFORM(" couldn't create %s, errno(%d): %s",
                op_dir, errno, strerror(errno));
        perr("Couldn't create %s", op_dir);
        return 1;
    }
    result = chmod(op_dir, 
            (S_IREAD|S_IWRITE|S_IEXEC|S_IRGRP|S_IXGRP|S_IROTH|S_IXOTH));
    if (result) {
        mINFORM(" couldn't chmod %s to 755, errno(%d): %s",
                op_dir, errno, strerror(errno));
        perr("Couldn't chmod %s to 755", op_dir);
        return 1;
    }

    /* create the device files */
    result = unlink(op_hashfile);
    result = mknod(op_hashfile, 
                   (S_IFCHR | S_IREAD|S_IWRITE|S_IEXEC), 
                   makedev(op_major, 1));
    if (result) {
        mINFORM(" couldn't mknod %s, errno(%d): %s",
                op_hashfile, errno, strerror(errno));
        perr("Couldn't mknod %s", op_hashfile);
        return 1;
    }
    result = unlink(op_devfile);
    result = mknod(op_devfile, 
                   (S_IFCHR | S_IREAD|S_IWRITE|S_IEXEC), 
                   makedev(op_major, 0));
    if (result) {
        mINFORM(" couldn't mknod %s, errno(%d): %s",
                op_devfile, errno, strerror(errno));
        perr("Couldn't mknod %s", op_devfile);
        return 1;
    }

    /* We may not end up using this one if oprofile is old. */
    result = unlink(op_notefile);
    result = mknod(op_notefile,
                   (S_IFCHR | S_IREAD|S_IWRITE|S_IEXEC),
                   makedev(op_major, 2));
    if (result) {
        mINFORM(" couldn't mknod %s, errno(%d): %s",
                op_notefile, errno, strerror(errno));
        perr("Couldn't mknod %s", op_notefile);
        return 1;
    }

    /* Check that there is noone else using the module.  Try to open the
     * hash map device.  This device can only be held open by one process
     * and does not start sampling when opened.
     */
    if ((gOp.hashfh=open(op_hashfile, O_RDONLY)) < 0) {
        mINFORM(" could not open %s errno(%d): %s", 
                op_hashfile, errno, strerror(errno));
        perr("Couldn't open %s errno(%d)", op_hashfile, errno);
        hint("Is another prospect or the oprofile daemon running?\n");
        return 1;
    }

    /* Try to open the note file.  If it fails with EINVAL, this
     * must be an old (OP6) oprofile module.
     */
    if ((gOp.notefh=open(op_notefile, O_RDONLY)) < 0) {
        mINFORM(" could not open %s errno(%d): %s",
                op_notefile, errno, strerror(errno));
        perr("Couldn't open %s errno(%d)", op_notefile, errno);
        if (errno == EINVAL) {
            ferr("pre-0.0.7 oprofile detected, please upgrade oprofile\n");
        } 
        return 1;
    } 
    /* Keep hash map open for us, this'll prevent others from initializing. */
    mINFORM(" device files created and %s opened.", op_hashfile);

    /* get hashmap size */
    if (!stat(op_hashsizefile, &sbuf)) {
        /* pre-0.5 interface, likelyhood of success later on is nil */
        gOp.hashsize = read_uint_from_proc(op_hashsizefile);
        ferr("warning, pre-0.5 oprofile detected, continuing anyway\n");
    }
    else
        gOp.hashsize = OP_HASH_MAP_SIZE;

    /* get buffer size now... */
    uval = read_uint_from_proc(op_bufsizefile);
    gOp.bufsize = sizeof(struct op_buffer_head) + 
                  (uval * sizeof(struct op_sample));
    mINFORM(" calculated buf size: %u", gOp.bufsize);
    
    /* ... and allocate space for the sample buffer */
    gOp.curbufhead = (struct op_buffer_head*) MALLOC(gOp.bufsize);
    if (gOp.curbufhead == NULL) {
        ferr("malloc fails for buffer size of: %u", gOp.bufsize);
        return 1;
    }
    mINFORM(" allocated %u for sample buffer at %p", gOp.bufsize, 
            gOp.curbufhead);

    /* get note buffer size... */
    uval = read_uint_from_proc(op_notesizefile);
    gOp.notesize = (uval) * sizeof(struct op_note);
    mINFORM(" calculated note buf size: %u", gOp.notesize);

    /* ... and allocate space for the note buffer */
    gOp.curnbuf = (struct op_note*) MALLOC(gOp.notesize);
    if (gOp.curnbuf == NULL) {
        ferr("malloc fails for buffer size of: %u", gOp.notesize);
        return 1;
    }
    mINFORM(" allocated %u for note buffer at %p", gOp.notesize, gOp.curnbuf);
 
    /* get system speed */
    strm = fopen("/proc/cpuinfo", "r");
    if (strm!=NULL) {
        while (fgets(buf, BUFSZ, strm)) {
            if (strstr(buf, "cpu MHz")) {
                char *ctmp;
                int len;
                ctmp = strrchr(buf, ':');
                if (!ctmp) continue;
                len=sscanf(ctmp+1,"%lf", &gConf.cpumhz);
                if (len!=1) gConf.cpumhz=500;
                continue;
            }
            if (strstr(buf, "processor")) {
                gConf.numcpus++;
            }
            if (gConf.cpumhz == 0) gConf.cpumhz = 500;
        }
        fclose(strm);
    }
    else {
        mINFORM(" couldn't open /proc/cpuinfo");
        gConf.cpumhz = 500;
    }
    mINFORM(" cpu mhz = %lf", gConf.cpumhz);
    mINFORM(" numcpus = %u", gConf.numcpus);
    /* allocate space for kernel profiles now we know numcpus */
    alloc_k_prof(); 

    /* init the auto flush rate if not overridden */
    if (gOp.flushrate<0) gOp.flushrate=200;
    
    /* read what oprofile thinks the cpu is */
    gConf.cputype = read_uint_from_proc(op_controldir "cpu_type");
    mINFORM(" read cpu_type=%d from proc", gConf.cputype);

    /* oprofile 0.0.9 requires a special case for RTC */
    if (stat(op_controldir "rtc_value", &sbuf) == 0) {
        /* file exists, must be 0.0.9 (or later) using RTC */
        gConf.cputype = CPU_RTC;
        mINFORM(" using RTC for sampling");
        gOp.p0.hz = read_uint_from_proc(op_controldir "rtc_value");
        if (gOp.samplehz) {
            mINFORM(" but sampling freq overridden to %lu Hz", gOp.samplehz);
            gOp.p0.hz = gOp.samplehz;
        }
        else if (gOp.p0.hz == 0) {
            gOp.p0.hz = gOp.samplehz = 256;
        }
        else {
            gOp.samplehz = gOp.p0.hz;
        }
        write_uint_to_proc(op_controldir "rtc_value", gOp.p0.hz);
        mINFORM(" sampling freq = %lu Hz", gOp.samplehz);
    }
    else {
        /* If perfcounters are all zero this means that this was a
         * fresh module load.  So, configure with defaults, otherwise, 
         * use what's there.
         */
        gOp.p0.count     = read_uint_from_proc(op_controldir "0/count");
        gOp.p0.enabled   = read_uint_from_proc(op_controldir "0/enabled");
        gOp.p0.event     = read_uint_from_proc(op_controldir "0/event");
        gOp.p0.kernel    = read_uint_from_proc(op_controldir "0/kernel");
        gOp.p0.unit_mask = read_uint_from_proc(op_controldir "0/unit_mask");
        gOp.p0.user      = read_uint_from_proc(op_controldir "0/user");
        
        gOp.p1.count     = read_uint_from_proc(op_controldir "1/count");
        gOp.p1.enabled   = read_uint_from_proc(op_controldir "1/enabled");
        gOp.p1.event     = read_uint_from_proc(op_controldir "1/event");
        gOp.p1.kernel    = read_uint_from_proc(op_controldir "1/kernel");
        gOp.p1.unit_mask = read_uint_from_proc(op_controldir "1/unit_mask");
        gOp.p1.user      = read_uint_from_proc(op_controldir "1/user");
    
        if (!gOp.p0.enabled && !gOp.p1.enabled) {
            mINFORM(" counters not enabled: setting defaults");
            if (gOp.samplehz) {
                mINFORM(" but sampling freq overridden to %lu Hz",gOp.samplehz);
                gOp.p0.count = (gConf.cpumhz*1000000.)/gOp.samplehz;
                gOp.p0.hz = gOp.samplehz;
            }
            else {
                gOp.p0.hz = gOp.samplehz = 200;
                gOp.p0.count = gConf.cpumhz*5000;
            }
            mINFORM(" counter 0 freq: %u Hz", gOp.p0.hz);
    
            if (gOp.p0.count == 0) 
                gOp.p0.count = 1;
            gOp.p0.enabled = 1;
/* REVISIT */
#ifdef __ia64__
            gOp.p0.event = 18;  /* CPU_CYCLES */
#else
            if (gConf.cputype <= CPU_PIII)
                gOp.p0.event = 0x79;  /* CPU_CLK_UNHALTED */
            else if (gConf.cputype == CPU_P4)
                gOp.p0.event = 0x1d;   /* GLOBAL_POWER_EVENTS */
            else if (gConf.cputype == CPU_ATHLON)
                /* Retried instructions is not the same as clock ticks!
                 * However, on Athlon, we do not have the equivalent of
                 * clock tics as the above.
                 */
                gOp.p0.event = 0xc0;  /* RETIRED_INSTRUCTIONS */ 
            else {
                ferr("unsupported cpu %d reported by oprofile", gConf.cputype);
                return 1;
            }
#endif
            gOp.p0.kernel = 1;
            gOp.p0.unit_mask = 1;
            gOp.p0.user = 1;
           
            mINFORM(" sampling freq = %lu Hz", gOp.samplehz);
            write_uint_to_proc(op_controldir "0/count", gOp.p0.count);
            write_uint_to_proc(op_controldir "0/enabled", gOp.p0.enabled);
            write_uint_to_proc(op_controldir "0/event", gOp.p0.event);
            write_uint_to_proc(op_controldir "0/kernel", gOp.p0.kernel);
            write_uint_to_proc(op_controldir "0/unit_mask", gOp.p0.unit_mask);
            write_uint_to_proc(op_controldir "0/user", gOp.p0.user);
        }
        else {
            mINFORM(" counters enabled, using current settings");
            if (gOp.p0.enabled) {
                if (gOp.samplehz) {
                    mINFORM(" but sampling freq overridden to %lu Hz", 
                            gOp.samplehz);
                    gOp.p0.count = (gConf.cpumhz*1000000.)/gOp.samplehz;
                    gOp.p0.hz = gOp.samplehz;
                    write_uint_to_proc(op_controldir "0/count", gOp.p0.count);
                }
                else {
                    gOp.p0.hz = (gConf.cpumhz * 1000000.)/gOp.p0.count;
                    gOp.samplehz = gOp.p0.hz;
                }
                mINFORM(" counter 0 freq: %u Hz", gOp.p0.hz);
            }
            if (gOp.p1.enabled) {
                gOp.p1.hz = (gConf.cpumhz * 1000000.)/gOp.p1.count;
                mINFORM(" counter 1 freq: %u Hz", gOp.p1.hz);
            }
        }
    }
    mINFORM(" successful op_init");
    return 0; 
} /* op_init() */

/*
 * int op_attach(void)
 *
 * Attach to the oprofile sampling module: map the path hash table through
 * the already open gOp.hashfh, then open the sample device file, which
 * activates sampling.
 *
 * Returns 0 on success and 1 on failure.
 */
int 
op_attach(void)
{
    mINFORM("In op_attach()");
    
    /* memory map the hash table for path retrievals.
     * NOTE(review): the mapping length is the compile-time
     * OP_HASH_MAP_SIZE, not the gOp.hashsize that op_init() works out --
     * confirm that this is intended for older oprofile interfaces.
     */
    gOp.hashmap = 
        mmap(0, OP_HASH_MAP_SIZE, PROT_READ, MAP_SHARED, gOp.hashfh, 0);
    if (gOp.hashmap == MAP_FAILED) {
        mINFORM(" couldn't mmap ophashmapdev, errno(%d): %s", 
                errno, strerror(errno));
        perr("Couldn't mmap hash map");
        return 1;
    }
    /* OP_HASH_MAP_SIZE has type size_t; convert it to match the format */
    mINFORM(" mmapped hash: %p size: %u", gOp.hashmap,
            (unsigned int) OP_HASH_MAP_SIZE);

    /* open the device file, this starts sample generation */
    if ((gOp.devfh=open(op_devfile, O_RDONLY)) < 0) {
        mINFORM(" could not open %s errno(%d): %s", 
                op_devfile, errno, strerror(errno));
        perr("Couldn't open %s", op_devfile);
        ferr("perhaps parameters are incorrect, check dmesg or /var/log/messages\n");
        return 1;
    }
    mINFORM(" opened %s", op_devfile);

    return 0;
} /* op_attach() */

/*
 * void op_shutdown(void)
 *
 * Stop sampling and disconnect from the oprofile sampling module: read
 * back the effective sampling rate, then close the sample device, the note
 * device and the hash map (unmapping it first).
 */
void 
op_shutdown(void)
{
    mINFORM("In op_shutdown()");

    /* Read back the sample counter value to get what the
     * count really is set to.  This is primarily for the laptop
     * patch which uses the RTC which can only be set to powers
     * of two.
     */
    if (gConf.cputype == CPU_RTC) {
        gOp.p0.hz = read_uint_from_proc(op_controldir "rtc_value");
    }
    else {
        gOp.p0.count = read_uint_from_proc(op_controldir "0/count");
        /* A zero count gives no usable rate; keep the rate known so far
         * rather than dividing by zero.
         */
        if (gOp.p0.count != 0)
            gOp.p0.hz = (gConf.cpumhz * 1000000.)/gOp.p0.count;
        else
            mINFORM(" read back count of 0, keeping previous rate");
    }
    gOp.samplehz = gOp.p0.hz;
    mINFORM(" read back sampling freq: %lu", gOp.samplehz);

    /* close the oprofile device file */
    mINFORM(" closing devices...");
    if (gOp.devfh > 0) 
        close(gOp.devfh);

    /* and the note file (if opened) */
    if (gOp.notefh > 0)
        close(gOp.notefh);

    /* and the path hash map */
    if (gOp.hashfh > 0) {
        munmap(gOp.hashmap, OP_HASH_MAP_SIZE);
        close(gOp.hashfh);
    }

} /* op_shutdown() */

/*
 * void op_flush(void)
 *
 * Ask the oprofile sampling module to flush its buffer by writing "1" to
 * the dump file.  A successful open is counted in gPstats[PRO_FLUSHES];
 * a failed open is only logged.
 */
void 
op_flush(void)
{
    FILE *dump;

    mINFORM("In op_flush()");

    dump = fopen(op_dumpfile, "w");
    if (dump != NULL) {
        /* the write itself is what triggers the flush */
        fputs("1\n", dump);
        fclose(dump);
        gPstats[PRO_FLUSHES]++;
        return;
    }

    mINFORM(" error opening %s, errno(%d): %s", 
            op_dumpfile, errno, strerror(errno));

} /* op_flush() */

/*
 * void op_autoflush(unsigned int ticks)
 *
 * If the passed var has value, arm the real-time interval timer so that
 * an alarm signal arrives every ticks/100 seconds, starting ticks/100
 * seconds from now.  I.e. ticks=57 fires every 0.57 seconds and
 * ticks=200 every 2 seconds.
 *
 * If passed var is zero, turn off the alarm.
 */
void
op_autoflush(unsigned int ticks)
{
    struct itimerval itv;

    mINFORM("In op_autoflush(%u)", ticks);

    /* Split the period into whole seconds and microseconds: tv_usec has
     * to stay below one million or setitimer() rejects the request.
     * ticks==0 yields an all-zero value, which disables the timer.
     */
    itv.it_interval.tv_sec  = ticks / 100;
    itv.it_interval.tv_usec = (ticks % 100) * 10000;
    itv.it_value            = itv.it_interval;

    if (setitimer(ITIMER_REAL, &itv, (struct itimerval *) 0) != 0) {
        mINFORM(" setitimer failed, errno(%d): %s", errno, strerror(errno));
    }

} /* op_autoflush() */

/*
 * void sigalrm_handler(int)
 *
 * Alarm signal handler for autoflush: flushes the oprofile buffer.
 *
 * errno is saved on entry and restored on exit so that the interrupted
 * code never sees an errno value left behind by the flush.
 *
 * NOTE(review): mINFORM() and op_flush() (which uses stdio) run in signal
 * context here; stdio is not async-signal-safe -- confirm this is
 * acceptable for the way the handler is installed.
 */
void 
sigalrm_handler(int sig)
{
    int saved_errno = errno;

    (void) sig;     /* only installed for the alarm signal */

    mINFORM("Caught ALRM signal");
    op_flush();

    errno = saved_errno;

} /* sigalrm_handler() */

/*
 * int op_empty_buffer(void)
 *
 * Drain what is left in the oprofile buffers.  Nothing is attempted once
 * a read error has been recorded in gOp.readerror.
 *
 * Currently this makes two rounds; each round flushes and then reads one
 * buffer per cpu so that no data is lost.  Always returns 0.
 */
int 
op_empty_buffer(void)
{
    int round, cpu;

    mINFORM("In op_empty_buffer()");

    if (!gOp.readerror) {
        for (round = 0; round < 2; round++) {
            op_flush();
            cpu = 0;
            while (cpu < gConf.numcpus) {
                op_read_buffer();
                cpu++;
            }
        }
    }

    return 0;
} /* op_empty_buffer() */

/*
 * int op_read_buffer(void)
 *
 * Read and process the buffer.  Note that the read will
 * block and put us to sleep until a buffer is ready.
 * Good read/process returns 0, bad 1.
 */
int
op_read_buffer(void)
{
    ssize_t count, ncount, ii;

    mINFORM("In op_read_buffer()");

    /* read the samples buffer ... */
    count = op_read_device(gOp.devfh, gOp.curbufhead, gOp.bufsize);
    if (count < 0) return 1;
    gPstats[PRO_BUFFERS_READ]++;
    
    /* ... and save samples buffer to trace file */
    if (mTRACEOUT) {
        bfile_write(BBUF_SAMPLES, (char*) gOp.curbufhead, count);
    }

    /* read note buffer ... */
    ncount = op_read_device(gOp.notefh, gOp.curnbuf, gOp.notesize);
    if (ncount < 0) return 1;
    gPstats[PRO_NOTIFICATIONS]+=ncount/sizeof(struct op_note);
    
    /* ... and save notes buffer to trace file */
    if (mTRACEOUT) {
        bfile_write(BBUF_NOTES, (char *)gOp.curnbuf, ncount);
    }

    /* now, process the note buffer */
    for (ii=0; ii<ncount/sizeof(struct op_note); ii++) {
        switch (gOp.curnbuf[ii].type) {
            case OP_EXEC:
                op_exec( ((struct op_note*)(gOp.curnbuf))[ii].pid );
                /* and drop through to MAP */
                
            case OP_MAP:
                op_map(ii);
                break;

            case OP_FORK:         op_fork(ii);         break;
            case OP_EXIT:         op_exit(ii);         break;
            case OP_DROP_MODULES: op_drop_modules(ii); break;
            default:
                gPstats[PRO_UNKNOWN_NOTES]++;
                mINFORM(" unknown notification type %u", gOp.curnbuf[ii].type);
                if (gConf.bug_level)
                    ferr("unknown notification %u received\n", 
                         gOp.curnbuf[ii].type);
                break;
        }
    }

    /* now, process the sample buffer */
    if (count) {
        for (ii=0; ii<gOp.curbufhead->count; ii++) {
            /* This can happen on initial start when hash table
             * is being filled.  New to 0.2 oprofile. */
            if (gOp.curbufhead->buffer[ii].eip == 0) continue;

            op_process_sample(ii);
        }
    }

    return 0;
} /* op_read_buffer() */


/* --------------------- Static routines follow --------------------- */

/*
 * static ssize_t op_read_device(int fh, void *buf, size_t size)
 *
 * Reads the oprofile device file specified as the fh file handle.
 * If we are using a trace file as input, buffers are filled from there
 * instead.
 *
 * Returns the number of bytes read.  On a read error other than an
 * interrupted call, the error is reported, gOp.readerror is incremented
 * and -1 is returned.
 */
static ssize_t
op_read_device(int fh, void *buf, size_t size)
{
    double delta_s, told=0.0, tnew;
    ssize_t count;
    int read_errno;

    mINFORM("In op_read_device() ");

    if (mTRACEIN) return bfile_read_fromfile(buf, size);
    
    /* Normal operation is we block in the read of the oprofile samples file
     * until either the buffer becomes full and gets dumped such that the read
     * succeeds and fills the entire buffer, or, the read is blocked until the
     * interval timer sends us the alrm signal.  The call is then interrupted
     * and since our handler flushes the buffer, the subsequent call will get
     * all that is in the buffer.
     *
     * The note buffer read always returns unblocked.
     *
     * Note: If you don't lseek to 0, then the oprofile module will return an
     * EINVAL.  If the buffer size is incorrect, you get an EINVAL, without
     * blocking...  The notebuffer never blocks and will also return an EINVAL
     * if the size is off.
     */

    mTIME_MS(told);
    for (;;) {
        lseek(fh, 0, SEEK_SET);
        count = read(fh, buf, size);
        if (count >= 0) break;

        /* an interrupted read is simply retried */
        read_errno = errno;
        if (read_errno == EINTR) continue;

        /* Anything else is fatal: retrying would loop forever on a
         * persistent error.
         */
        errno = read_errno;
        perr("Read of device file fails, %lu bytes requested",
             (unsigned long) size);
        mINFORM(" read of %lu bytes from devfile bad, errno(%d): %s", 
                (unsigned long) size, read_errno, strerror(read_errno));
        if (read_errno == EINVAL) {
            ferr(
"\n"
"Note: If you are using an oprofile that is not at version " CURRENT_OPROFILE "\n"
"      then the ABI has changed, please upgrade/downgrade oprofile.\n"
"      If this is a Red Hat 8.0 system, please use version 0.9.6a of prospect."
"\n");
        }
        gOp.readerror++;
        return -1;
    }

    if (gDoGettimeofday) {
        tnew = get_time_ms();
        delta_s  = tnew - told;
        mINFORM(" read of opdev took: %f msec read: %ld bytes", 
                delta_s, (long) count);
    }
    else {
        mINFORM(" read of opdev: %ld bytes", (long) count);
    }

    return count;
} /* op_read_device() */

/*
 * static void op_process_sample(ssize_t idx)
 *
 * Process the sample at index idx of the current sample buffer.  At this
 * point, the proc struct should exist - either pre-existing or exec'ed.
 *
 * A pc above gKernelVmOffset is accounted as a kernel hit, anything else
 * as a user hit.  A kernel hit for an unknown, non-zero pid creates the
 * process on the spot and marks it as a probable kernel thread.
 */
static void
op_process_sample(ssize_t idx)
{
    process_t *p;
    unsigned int counter, pid;
    unsigned long pc;

    counter = gOp.curbufhead->buffer[idx].counter;
    pid = gOp.curbufhead->buffer[idx].pid;
    pc = gOp.curbufhead->buffer[idx].eip;

    mINFORM("In op_process_sample(%ld) pid=%u counter=%u",
            (long) idx, pid, counter);
    gPstats[PRO_SAMPLES]++;
    gPstats[PRO_NUM_SAMPLE_TRACES]++;

    p = getproc_by_pid(pid);
    if (p==NULL && pc < gKernelVmOffset) {
        mBUG("Unknown pid(%u) for hit(%p) counter(%u) idx(%ld)", 
             pid, (void*) pc, counter, (long) idx);
        gPstats[PRO_UNKNOWN_HITS]++;
    }

    if (pc > gKernelVmOffset) {
        gPstats[PRO_SYS_HITS]++;
#ifdef __ia64__
        /* last region in ia64 land is kernel text */
        if (gConf.flags.do_kernel && 
                (pc < (unsigned long) gK.k_vas.vh_bck->rd_start))
            gPstats[PRO_MOD_HITS]++;
#else
        /* first region in ia32 land is kernel text */
        if (gConf.flags.do_kernel && 
                (pc > (unsigned long) gK.k_vas.vh_fwd->rd_end))
            gPstats[PRO_MOD_HITS]++;
#endif
        mINFORM("Kernel hit: pc=%p pid=%u counter=%u recnum=%ld",
                (void*) pc, pid, counter, (long) idx);
        /* figure out if this is a new kernel thread */
        if (p==NULL && pid) {
            mINFORM(" spontaneous birth of pid=%u", pid);
            p = alloc_proc(pid);
            p->pr_birthBy = cBIRTH_spontaneous;
            p->pr_isKthread = TRUE;  /* high probability */
            putproc_by_pid(pid, p);
        }
#ifdef __ia64__
        /* if hits to signal gate section, relocate them */
        if (pc < gOp.addr_gate_map_end) {
            if (pc > gOp.addr_gate_map_start) {
                pc = gOp.addr_start_gate + (pc - gOp.addr_gate_map_start);
                mINFORM(" gate hit, relocated PC to 0x%lx", pc);
                gPstats[PRO_GATE_HITS]++;
            }
        }
#endif
        profile_sys(p, pc, gOp.curbufhead->cpu_nr);
        profile_kernel(p, pc, gOp.curbufhead->cpu_nr);
        mPTREC("kern_hit: pid=%u cpu=%u pc=0x%lx counter=%u ",
               pid, gOp.curbufhead->cpu_nr, pc, counter); 
        if (gConf.flags.do_kernel)
            mPTREC("symbol=%s\n", kernel_symbol((void*)pc));
        else
            mPTREC("\n");

    }
    else {
        gPstats[PRO_USR_HITS]++;
        /* NOTE(review): p is NULL here for a hit from an unknown pid and
         * is still passed on -- confirm that profile_user() accepts that.
         */
        profile_user(p, pc, gOp.curbufhead->cpu_nr);
        /* p may be NULL (unknown pid), so check it before its name */
        mPTREC("user_hit: pid=%u cpu=%u name=%s pc=0x%lx counter=%u\n",
               pid, gOp.curbufhead->cpu_nr,
               (p != NULL && p->pr_myKname != NULL) ? p->pr_myKname : "nill",
               pc, counter); 
    }
    
    if (pid == gConf.my_pid) gPstats[PRO_SELF_HITS]++;

} /* op_process_sample() */

/*
 * static void op_fork(ssize_t idx)
 *
 * Process fork in the trace.  Note that we can get a 
 * fork _after_ an exec.
 */
static void
op_fork(ssize_t idx)
{
    process_t *pold, *pnew;
    pid_t pid;
    unsigned long addr;
    int ii;

    pid = ((struct op_note*)(gOp.curnbuf))[idx].pid;
    addr = ((struct op_note*)(gOp.curnbuf))[idx].addr;

    mINFORM("In op_fork(%u):  new=%lu  old=%lu", idx, addr, pid);

    pold = getproc_by_pid(pid);
    if (!pold) {
        mINFORM(" fork from non-existent process");
        if (gConf.bug_level)
            ferr("fork(%lu) from non-existant process(%lu)\n", addr, pid);
    }

    pnew = getproc_by_pid(addr);
    /* process doesn't exist - normal operations */
    if (!pnew) {
        region_t *prd, *crd, *prevrd=NULL;
        mINFORM(" new does not exist, creating");
        pnew = alloc_proc(addr);
        pnew->pr_birthBy = cBIRTH_fork;
        if (pnew->pr_parent == NULL)
            pnew->pr_parent = pold;
        /* copy parent's regions over if parent exists */
        if (pnew->pr_parent) {
            prd = (region_t*) pnew->pr_parent->pr_vasHead->vh_fwd;
            ii = 0;
            while (prd && prd != (void*)pnew->pr_parent->pr_vasHead) {
                ii++;
                crd = CALLOC(sizeof(region_t), 1);
                if (!crd) {
                    mBUG("Calloc failure");
                    prospect_exit(1);
                }
                /* dup region */
                crd->rd_freed = prd->rd_freed;
                crd->rd_pid = addr;
                if (prd->rd_path) crd->rd_path = strdup(prd->rd_path);
                crd->rd_start = prd->rd_start;
                crd->rd_end = prd->rd_end;
                crd->rd_length = prd->rd_length;
                crd->rd_offset = prd->rd_offset;

                /* and link in region into vas */
                if (pnew->pr_vasHead->vh_fwd == NULL) { 
                    crd->rd_bck = (void*) (pnew->pr_vasHead);
                    pnew->pr_vasHead->vh_fwd = crd;
                    prevrd = crd;
                }
                else {
                    prevrd->rd_fwd = crd;
                    crd->rd_bck = prevrd;
                    prevrd = crd;
                }
                crd->rd_fwd = (void*)pnew->pr_vasHead;
                pnew->pr_vasHead->vh_bck = crd;

                /* move on to next region in parent */
                prd = prd->rd_fwd;
            }
            pnew->pr_vasHead->vh_entries = 
                pnew->pr_parent->pr_vasHead->vh_entries;
            mINFORM(" duplicated %d regions from parent(vh_entries=%d)", 
                    ii, pnew->pr_vasHead->vh_entries);
            if (gConf.bug_level) {
                region_t *r;
                int i;
                mINFORM(" parent pid %u, path %s", 
                        pold->pr_myPid, pold->pr_path);
                mINFORM(" region list:");
                mINFORM("    head: 0x%lx   0x%lx   0x%lx", 
                        (unsigned long) pnew->pr_vasHead,
                        (unsigned long) pnew->pr_vasHead->vh_fwd,
                        (unsigned long) pnew->pr_vasHead->vh_bck);
                r = pnew->pr_vasHead->vh_fwd;
                i = 0;
                do {
                    mINFORM("  %6d: 0x%lx   0x%lx   0x%lx", 
                            i, (unsigned long) r, (unsigned long) r->rd_fwd, 
                            (unsigned long) r->rd_bck);
                    r = r->rd_fwd;
                    i++;
                } while (r != (void*) pnew->pr_vasHead);
            } 
            /* if there's no name, create one */
            if (!pnew->pr_path && pnew->pr_parent->pr_path) 
                pnew->pr_path = strdup(pnew->pr_parent->pr_path);
            if (!pnew->pr_myKname) {
                char buf[256];
                sprintf(buf,"forked_%s", pnew->pr_parent->pr_myKname);
                pnew->pr_myKname = strdup(buf);
            }
        }
        /* link in new proc into glob list */
        putproc_by_pid(addr, pnew);

        /* write process struct out to file */
        if (!mTRACEIN) bfile_write_proc(pnew->pr_myPid);
    }
    /* otherwise, exec came in before fork - don't copy maps! */
    else {
        mINFORM(" new already exists, updating");
        /* re-read /proc to update */
        if (!mTRACEIN) read_proc_for_pid(pnew);
    }

    mPTREC("fork: pid=%u cpu=%u name=%s old_pid=%u ", 
            addr, gOp.curbufhead->cpu_nr,
            pnew->pr_myKname == NULL ? "nill" : pnew->pr_myKname, pid); 

    /* make sure pid's are propagated */
    if (pold) {
        pnew->pr_myParentPid = pold->pr_myPid;
        pnew->pr_myGroupPid = pold->pr_myGroupPid;
        pnew->pr_mySid = pold->pr_mySid;
        mPTREC("old_name=%s ", 
               pold->pr_myKname == NULL ? "nill" : pold->pr_myKname); 
    }

    mPTREC("\n");

    /* ait: check for kernel threads, how? */

    gPstats[PRO_FORKS]++;
} /* op_fork() */

/*
 * static void op_exec(pid_t pid)
 *
 * Process exec in the trace.  Maps will follow this 
 * trace.
 */
static void
op_exec(pid_t pid)
{
    process_t *p;

    mINFORM("In op_exec(pid=%u)", pid);
    gPstats[PRO_EXECS]++;

    p = getproc_by_pid(pid);
    mPTREC("exec: pid=%u cpu=%u ", pid, gOp.curbufhead->cpu_nr);

    /* If the process re-exec's itself, and we clear out the maps, then
     * the profile we have becomes invalid.  What to do?
     * Maybe the answer is to keep profiles in the regions - not the 
     * process.  For now, we just throw the old profile away, yuck!
     */
    if (p!=NULL) {
        region_t *r;
        unsigned int ii;

        mPTREC("name=%s ", p->pr_myKname == NULL ? "nill"  : p->pr_myKname); 
        p->pr_exec_times++;
        if (p->pr_exec_times > 1) {
            /* proc exec'ed itself */
            mINFORM(" re-exec of this pid, exec_times=%d", p->pr_exec_times);
            if (gConf.bug_level)
                ferr("process %u re-exec %u, will lose profiles up to now,"
                     " unfortunately\n",
                     pid, p->pr_exec_times);
            mPTREC("exec_times=%u ", p->pr_exec_times);
        }

        /* kill the region maps, regardless - none if fork created this */
        if (p->pr_vasHead->vh_entries) {
            region_t *rtofree;
            mINFORM(" process has %d maps", p->pr_vasHead->vh_entries);
            ii=0;
            r=p->pr_vasHead->vh_fwd;
            do {
                rtofree = r;
                r = r->rd_fwd;
                if (rtofree->rd_name) FREE(rtofree->rd_name);
                if (rtofree->rd_path) FREE(rtofree->rd_path);
                FREE(rtofree);
                ii++;
            } while (r != (void*) p->pr_vasHead);
            mINFORM(" cleared %d maps", ii);
            if (ii != p->pr_vasHead->vh_entries)
                mBUG("vh_entires(%u)!=maps(%u) in clear for re-exec of %u",
                      p->pr_vasHead->vh_entries, ii, p->pr_myPid);
            p->pr_vasHead->vh_fwd = NULL;
            p->pr_vasHead->vh_bck = NULL;
            p->pr_vasHead->vh_entries = 0;
        }
        /* clear out the (now) invalid profile - leak 
         * none if fork created this
         */
        p->pr_profile = NULL;
        p->pr_profTot = 0;
        p->pr_profUniq = 0;
        p->pr_sysProfile = NULL;
        p->pr_sysProfTot = 0;
        p->pr_sysProfUniq = 0;
        p->pr_birthBy = cBIRTH_exec;    /* reset reason since this is an exec */
        mPTREC("\n");
        return;
    }

    /* otherwise, allocate new process struct and link in */
    mINFORM(" new process, allocating and linking in");
    p = alloc_proc(pid);
    p->pr_birthBy = cBIRTH_exec;
    putproc_by_pid(pid, p);
    mPTREC("name=nill\n");

    /* watch for my child */
    if (pid == gConf.my_child.pid) gConf.my_child.p = p;

} /* op_exec() */

/*
 * static void op_exit(ssize_t idx)
 *
 * Process exit in the trace.
 * Not much to do except set the endBy flag.  Presumably,
 * no further hits will take place in this proc.
 *
 * The exiting pid is read from the note at index idx of
 * gOp.curnbuf.  An exit for a pid we have no process struct
 * for is reported (when bug_level is set) and otherwise ignored.
 */
static void
op_exit(ssize_t idx)
{
    process_t *p;
    pid_t pid;

    pid = ((struct op_note*)(gOp.curnbuf))[idx].pid;
    /* casts keep each argument the same type as its conversion */
    mINFORM("In op_exit(%u) pid=%lu", (unsigned int) idx, (unsigned long) pid);
    gPstats[PRO_EXITS]++;
    p = getproc_by_pid(pid);
    if (p==NULL) {
        mINFORM("  exit for non-existant proc, pid=%u", pid);
        if (gConf.bug_level) 
            ferr("exit for non-existant process, pid=%u\n", pid);
        /* just ignore this */
        return;
    }
    p->pr_endBy = cEND_exit;
    mPTREC("exit: pid=%u cpu=%u name=%s\n", p->pr_myPid, gOp.curbufhead->cpu_nr,
            p->pr_myKname == NULL ? "nill" : p->pr_myKname); 
} /* op_exit() */

/*
 * static void op_map(ssize_t idx)
 *
 * Process mapping in the trace.  The map is 
 * inserted in ascending order. This makes the 
 * profile matching easier later on and doesn't 
 * add too much time to the list search.
 * We also don't check for duplicate maps.
 */
static void
op_map(ssize_t idx)
{
    unsigned int hash;
    region_t *r;
    process_t *p;
    static char path[MAXPATHLEN], *c;
    char *name;
    int goodpath;

    mINFORM("In op_map(%u) pid=%u", idx, gOp.curnbuf[idx].pid);

    /* alloc new region struct */
    r = CALLOC(sizeof(region_t), 1);
    if (!r) {
        mBUG("Calloc failure");
        prospect_exit(1);
    }

    /* extract mapping info from trace */
    r->rd_start    = (char*)gOp.curnbuf[idx].addr;
    r->rd_pid      = gOp.curnbuf[idx].pid;
    r->rd_length   = gOp.curnbuf[idx].len;
    r->rd_offset   = gOp.curnbuf[idx].offset;

    r->rd_end = (char*) ((unsigned long)r->rd_start + r->rd_length);

    p = getproc_by_pid(gOp.curnbuf[idx].pid);
    if (p==NULL) {
        mBUG("Got mapping for non-existant process,"
             " pid=%u, 0x%lx=0x%lx offset=0x%lx",
             r->rd_pid, r->rd_start, r->rd_end, r->rd_offset);
        FREE(r);
        return;
    }
   
    mPTREC("mmap: pid=%u cpu=%u name=%s ", p->pr_myPid, 
            gOp.curbufhead->cpu_nr, 
            p->pr_myKname == NULL ? "nill"  : p->pr_myKname); 
    mPTREC("start=0x%lx len=%lu offset=0x%lx ",
            r->rd_start, r->rd_length, r->rd_offset);
    mINFORM(" extracted values: start=%p pid=%u length=%lu offset=0x%lu",
            r->rd_start, r->rd_pid, r->rd_length, r->rd_offset);

    /* Now determine the path from the oprofile hashmap: build from
     * the end to the beginning where c points to the end of path
     * and gets decremented as we discover parent path elements in the
     * oprofile hash map.  If we're tracing in, then don't access the
     * hashmap since we've saved the file path in a FILE_PATH record.
     */ 
    if (!mTRACEIN) {
        mINFORM(" extracting path from hashmap");
        c = &path[MAXPATHLEN-1];
        *c = '\0';
        goodpath=FALSE;
        hash = gOp.curnbuf[idx].hash;
        while (hash) {
            if (hash == -1) {
                /* we don't have a path - possible deletion */
                mINFORM(" hash==-1: was the file deleted?");
                if (gConf.bug_level)
                    ferr("map for pid=%u 0x%lx:0x%lx has no path,"
                         " (deleted?), skipping\n",
                         r->rd_pid, r->rd_start, r->rd_end);
                break;
            }
            if (hash<0 || hash >= OP_HASH_LINE_NR) {
                /* error */
                mBUG("Hash value %u out of range: 0:%u", hash, OP_HASH_LINE_NR);
                break;
            }
            name = GET_STRING(gOp.hashmap, gOp.hashmap[hash].name);
            if (strlen(name)+strlen(c)+1 >= MAXPATHLEN) {
                mBUG("Path too long in hashmap, extra: [%s]  current: [%s]", 
                     gOp.hashmap[hash].name, path);
                break;
            }

            c -= strlen(name)+1;
            strncpy(c,"/",1);
            strncpy(c+1,name, strlen(name));

            hash = gOp.hashmap[hash].parent;
            if (hash==0) goodpath = TRUE;
        }

        if (goodpath) {
            mINFORM(" extracted: %s", c);
            mPTREC("path=%s ", c);
            r->rd_path = strdup(c);
            gPstats[PRO_MAPS]++;
        }
        else {
            mINFORM(" bad hashmap extraction, leaving path as null");
            r->rd_path = NULL;
        }

        if (mTRACEOUT) bfile_write_rpath(r);
    }

    /* link in new region into vas */
    if (p->pr_vasHead->vh_fwd == NULL) {          /* first region */
        r->rd_bck = (void*) (p->pr_vasHead);
        r->rd_fwd = (void*)p->pr_vasHead;
        p->pr_vasHead->vh_fwd = r;
        p->pr_vasHead->vh_bck = r;
    }
    else {
        region_t *tmp;
        /* set to first node */
        tmp = p->pr_vasHead->vh_fwd;
        /* if first node is not already greater */
        if (tmp->rd_start < r->rd_start) {
            /* look for insert point */
            while(tmp->rd_fwd != (void*) p->pr_vasHead) {
                tmp = tmp->rd_fwd;
                /* if this node has higher address, we're done */
                if (tmp->rd_start > r->rd_start)
                    break;
            }
        }
        /* check if we need to insert after tmp */
        if (tmp->rd_start < r->rd_start) {
            r->rd_bck = tmp;
            r->rd_fwd = tmp->rd_fwd;
            tmp->rd_fwd = r;
        }
        else {
            /* otherwise, insert in front of tmp */
            r->rd_fwd = tmp;
            r->rd_bck = tmp->rd_bck;
            /* in the middle */
            if (tmp->rd_bck != (void*) p->pr_vasHead) {
                tmp->rd_bck->rd_fwd = r;
            }
            else {
                p->pr_vasHead->vh_fwd = r;
            }
            tmp->rd_bck = r;
        }
    }

    p->pr_vasHead->vh_entries++;

    mINFORM(" linked in region %d into vas %p", p->pr_vasHead->vh_entries,
            p->pr_vasHead);

    /* The first map is the text region for the proc in IA32 land
     * so if we don't have  a path and/or a kpath, fill them in from
     * this map.  For IPF, the text is in quadrant 0x400.... and maps
     * from the earlier quadrants have shared libraries in them.
     * REVISIT: what if text gets mapped into a different region for ia64.
     */
    if (!mTRACEIN) {  
        /* if we're reading a trace file, paths will happen later */
#ifdef __ia64__
        if ((unsigned long)r->rd_start == 0x4000000000000000L) {
#else
        if (p->pr_vasHead->vh_entries == 1) {
#endif
            char s[32];

            if (r->rd_path) p->pr_path = strdup(r->rd_path);
            mINFORM(" duped path to process path: %s", p->pr_path);
            extract_basename(s, p->pr_path);
            p->pr_myKname = strdup(s);
            mINFORM(" duped base name for kname: %s", p->pr_myKname);
        }
    }
    mPTREC("\n");
} /* op_map() */

/*
 * static void op_drop_modules(ssize_t idx)
 *
 * Process drop modules notification in the trace.
 * For now this only logs the note index and counts the event
 * in gPstats[PRO_DROP_MODULES].
 * ait: tbd
 */
static void
op_drop_modules(ssize_t idx)
{
    /* cast keeps the argument the same type as the %u conversion */
    mINFORM("In op_drop_modules(%u)", (unsigned int) idx);
    gPstats[PRO_DROP_MODULES]++;
} /* op_drop_modules() */

/*
 * static int run_insmod(char *modname)
 *
 * Exec the insmod command to load the oprofile module.
 *
 * modname is passed to /sbin/insmod as the module to load, together
 * with "allow_unload=1".  When gOp.has_sys_call_table is not set, the
 * addresses in gOp.addr_sys_call_table and gOp.addr_sys_mmap2 are
 * passed along as the addr_sys_call_table= and addr_sys_mmap2=
 * module parameters.
 *
 * Returns -1 if fork() or waitpid() fails, otherwise the raw wait
 * status that waitpid() stored for the insmod child.
 */
static int
run_insmod(char *modname)
{
    int ret=0;
    pid_t pid;
    char sys_call_str[128], sys_mmap2_str[128];

    mINFORM("Execing insmod...");

    if (!gOp.has_sys_call_table) {
        snprintf(sys_call_str, sizeof(sys_call_str),
                 "addr_sys_call_table=0x%lx", gOp.addr_sys_call_table);
        snprintf(sys_mmap2_str, sizeof(sys_mmap2_str),
                 "addr_sys_mmap2=0x%lx", gOp.addr_sys_mmap2);
        mINFORM(" address for sys_call_table: %s", sys_call_str);
        mINFORM(" address for sys_mmap2: %s", sys_mmap2_str);
    }
    
    /* create child */
    if ((pid=fork()) < 0) {
        perr("Insmod fork failed");
        return -1;
    }

    if (pid==0) {
        /* we're the child */
        /* "-q" goes before the module name in both forms so it is
         * always read as an insmod option rather than trailing the
         * module parameters; the list ends with a null pointer of
         * type char * as the exec family requires
         */
        if (gOp.has_sys_call_table) {
            execl("/sbin/insmod", "insmod", "-q",
                  modname, "allow_unload=1",
                  (char *) NULL);
        }
        else {
            execl("/sbin/insmod", "insmod", "-q",
                  modname, "allow_unload=1",
                  sys_call_str, sys_mmap2_str,
                  (char *) NULL);
        }

        /* not supposed to be here */
        perr("Gasp: couldn't exec /sbin/insmod");
        /* _exit: leave the stdio buffers and exit handlers inherited
         * from the parent alone in this forked copy
         */
        _exit(1);
    }

    /* as a parent, we'll wait to avoid a zombie */
    if (waitpid(pid, &ret, 0) < 0) {
        perr("Waitpid error in insmod fork");
        return -1;
    }

    return ret;
} /* run_insmod() */

