This article is also published on my WeChat official account; feel free to subscribe.
Chapter 4: The CM Compiler
In this chapter we use a single example, Gaussian blur, to show how a CM client (host) program and server (kernel) program are tied together and finally run. The example itself is one provided by Intel, but with no explanation of how to use it.
Section 1: Building the client program
Assume the file is named gauss_client.cpp.
#include "cm_rt.h"
#include "common/bitmap_helpers.h"
#include "common/cm_rt_helpers.h"
#include "common/isa_helpers.h"
using cm::util::bitmap::BitMap;
// Defines the number of columns per thread.
#define NUM_COLS_PER_THREAD 8
// Defines the number of rows per thread.
#define NUM_ROWS_PER_THREAD 8
// Declares coefficients for gaussian filter.
float a0 = 0, a1 = 0, a2 = 0, a3 = 0, b1 = 0, b2 = 0, coefp = 0, coefn = 0;
// Computes the coefficients for the Gaussian filter.
void CalculateCoefficients(float sigma, int order) {
const float nsigma = sigma < 0.1f ? 0.1f : sigma;
const float alpha = 1.695f / nsigma;
const float ema = (float)exp(-alpha);
const float ema2 = (float)exp(-2 * alpha);
b1 = -2 * ema;
b2 = ema2;
switch (order) {
case 0: {
const float k = (1 - ema) * (1 - ema) / (1 + 2 * alpha * ema - ema2);
a0 = k;
a1 = k * (alpha - 1) * ema;
a2 = k * (alpha + 1) * ema;
a3 = -k * ema2;
} break;
case 1: {
const float k = (1 - ema) * (1 - ema) / ema;
a0 = k * ema;
a1 = a3 = 0;
a2 = -a0;
} break;
case 2: {
const float
ea = (float)exp(-alpha),
k = -(ema2 - 1) / (2 * alpha * ema),
kn = (-2 * (-1 + 3 * ea - 3 * ea * ea + ea * ea * ea) /
(3 * ea + 1 + 3 * ea * ea + ea * ea * ea));
a0 = kn;
a1 = -kn * (1 + k * alpha) * ema;
a2 = kn * (1 - k * alpha) * ema;
a3 = -kn * ema2;
} break;
default:
fprintf(stderr, "gaussianFilter: invalid order parameter!\n");
return;
}
coefp = (a0 + a1) / (1 + b1 + b2);
coefn = (a2 + a3) / (1 + b1 + b2);
printf("Coefficients are: \n");
printf(" a0 = %f, a1 = %f, a2 = %f, a3 = %f, b1 = %f, b2 = %f\n", a0, a1, a2, a3, b1, b2);
}
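// Note on the coefficients: a0..a3, b1 and b2 define a recursive (IIR)
// approximation of the Gaussian (a Deriche-style filter). The kernels in
// gauss_genx.cpp apply them as a forward recurrence
//   out[n] = a0*in[n] + a1*in[n-1] - (b1*out[n-1] + b2*out[n-2])
// and a backward recurrence that uses a2 and a3 instead of a0 and a1;
// coefp and coefn seed the recursion state at the image borders when
// CLAMP_TO_EDGE is enabled in the kernel code.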
int main(int argc, char *argv[]) {
// Loads an input image named "lena.bmp".
auto input_image = BitMap::load("lena.bmp");
// Gets the width and height of the input image.
unsigned int width = input_image.getWidth();
unsigned int height = input_image.getHeight();
printf("image width = %d, height = %d\n", width, height);
// Checks the value of width, height and bpp(bits per pixel) of the image.
// Only images in 8-bit RGB format are supported.
// Only images with width and height a multiple of 8 are supported.
if (width & 7 || height & 7 || input_image.getBPP() != 24) {
std::cerr << "Error: Only images in 8-bit RGB format with width and "
<< "height a multiple of 8 are supported.\n";
std::exit(1);
}
// Copies input image to output except for the data.
auto output_image = input_image;
// Sets image size in bytes. There are a total of width*height pixels and
// each pixel occupies (out.getBPP()/8) bytes.
unsigned int img_size = width * height * output_image.getBPP() / 8;
// Sets output to blank image.
output_image.setData(new unsigned char[img_size]);
// Allocates system memory for rgb_to_rgba to convert image format from
// RGB to RGBA.
// Allocates system memory for rgba_to_rgb to convert image format from
// RGBA to RGB.
unsigned int num_pixels = width * height;
unsigned char *rgb_to_rgba = new unsigned char[num_pixels * 4];
unsigned char *rgba_to_rgb = new unsigned char[num_pixels * 4];
// Converts image format from RGB to RGBA.
// Copies the RGB values from the image and sets the 4th byte to zero.
for (int i = 0; i < num_pixels; i++) {
rgb_to_rgba[i * 4] = input_image.getData()[i * 3];
rgb_to_rgba[i * 4 + 1] = input_image.getData()[i * 3 + 1];
rgb_to_rgba[i * 4 + 2] = input_image.getData()[i * 3 + 2];
rgb_to_rgba[i * 4 + 3] = 0;
}
// Computes coefficients for gaussian filter.
float sigma = 10.0f;
int order = 0;
CalculateCoefficients(sigma, order);
// Creates a CmDevice from scratch.
// Param device: pointer to the CmDevice object.
// Param version: CM API version supported by the runtime library.
CmDevice *device = nullptr;
unsigned int version = 0;
cm_result_check(::CreateCmDevice(device, version));
// The file gauss_genx.isa is generated when the kernels in the file
// gauss_genx.cpp are compiled by the CM compiler.
// Reads in the virtual ISA from "gauss_genx.isa" to the code
// buffer.
std::string isa_code = cm::util::isa::loadFile("gauss_genx.isa");
if (isa_code.size() == 0) {
std::cerr << "Error: empty ISA binary.\n";
std::exit(1);
}
// Creates a CmProgram object consisting of the kernels loaded from the code
// buffer.
// Param isa_code.data(): Pointer to the code buffer containing the virtual
// ISA.
// Param isa_code.size(): Size in bytes of the code buffer containing the
// virtual ISA.
CmProgram *program = nullptr;
cm_result_check(device->LoadProgram(const_cast<char *>(isa_code.data()),
isa_code.size(),
program));
// For vertical direction.
// Creates the kernel.
// Param program: CM Program from which the kernel is created.
// Param "gaussianVertical": The kernel name which should be no more than 256
// bytes including the null terminator.
CmKernel *kernel_vertical = nullptr;
cm_result_check(device->CreateKernel(program,
"gaussianVertical",
kernel_vertical));
// Creates input surface with given width and height in pixels and format.
CmSurface2D *input_surface = nullptr;
cm_result_check(device->CreateSurface2D(4 * width,
height,
CM_SURFACE_FORMAT_A8,
input_surface));
// Copies system memory content to the input surface using the CPU. The
// system memory content is the data of the input image in RGBA format.
// The size of data copied is the size of data in the rgb_to_rgba.
cm_result_check(input_surface->WriteSurface(rgb_to_rgba, nullptr));
// Creates the temp surface. The width, height and format are the same as
// the input surface.
// The temp surface contains the output of kernel_vertical.
CmSurface2D *temp_surface = nullptr;
cm_result_check(device->CreateSurface2D(4 * width,
height,
CM_SURFACE_FORMAT_A8,
temp_surface));
// When a surface is created by the CmDevice a SurfaceIndex object is
// created. This object contains a unique index value that is mapped to the
// surface.
// Gets the input surface index.
SurfaceIndex *input_surface_idx = nullptr;
cm_result_check(input_surface->GetIndex(input_surface_idx));
// Gets the temp surface index.
SurfaceIndex *temp_surface_idx = nullptr;
cm_result_check(temp_surface->GetIndex(temp_surface_idx));
// Sets a per kernel argument.
// Sets input surface index as the first argument of kernel_vertical.
// Sets temp surface index as the second argument of kernel_vertical.
cm_result_check(kernel_vertical->SetKernelArg(0,
sizeof(SurfaceIndex),
input_surface_idx));
cm_result_check(kernel_vertical->SetKernelArg(1,
sizeof(SurfaceIndex),
temp_surface_idx));
// Sets the image width and height as the third and the fourth argument
// of kernel_vertical.
cm_result_check(kernel_vertical->SetKernelArg(2, 4, &width));
cm_result_check(kernel_vertical->SetKernelArg(3, 4, &height));
// Sets filter coefficients as the rest arguments of kernel_vertical.
cm_result_check(kernel_vertical->SetKernelArg(4, 4, &a0));
cm_result_check(kernel_vertical->SetKernelArg(5, 4, &a1));
cm_result_check(kernel_vertical->SetKernelArg(6, 4, &a2));
cm_result_check(kernel_vertical->SetKernelArg(7, 4, &a3));
cm_result_check(kernel_vertical->SetKernelArg(8, 4, &b1));
cm_result_check(kernel_vertical->SetKernelArg(9, 4, &b2));
cm_result_check(kernel_vertical->SetKernelArg(10, 4, &coefp));
cm_result_check(kernel_vertical->SetKernelArg(11, 4, &coefn));
// Each CmKernel can be executed by multiple concurrent threads.
// Here, for "kernel_vertical" kernel, each thread works on
// NUM_COLS_PER_THREAD columns in vertical direction.
int thread_width = width / NUM_COLS_PER_THREAD;
// Creates a CmThreadSpace object.
// There are two usage models for the thread space. One is to define the
// dependency between threads to run in the GPU. The other is to define a
// thread space where each thread can get a pair of coordinates during
// kernel execution. For this example, we use the latter usage model.
CmThreadSpace *thread_space = nullptr;
cm_result_check(device->CreateThreadSpace(thread_width,
1,
thread_space));
// Creates a task queue.
// The CmQueue is an in-order queue. Tasks get executed according to the
// order they are enqueued. The next task does not start execution until the
// current task finishes.
CmQueue *cmd_queue = nullptr;
cm_result_check(device->CreateQueue(cmd_queue));
// Creates a CmTask object.
// The CmTask object is a container for CmKernel pointers. It is used to
// enqueue the kernels for execution.
CmTask *task = nullptr;
cm_result_check(device->CreateTask(task));
// Adds a CmKernel pointer to CmTask.
// This task has one kernel.
cm_result_check(task->AddKernel(kernel_vertical));
// Launches the task on the GPU. Enqueue is a non-blocking call, i.e. the
// function returns immediately without waiting for the GPU to start or
// finish execution of the task. The runtime will query the HW status. If
// the hardware is not busy, the runtime will submit the task to the
// driver/HW; otherwise, the runtime will submit the task to the driver/HW
// at another time.
// An event, "sync_event", is created to track the status of the task.
CmEvent *sync_event = nullptr;
cm_result_check(cmd_queue->Enqueue(task,
sync_event,
thread_space));
// Destroys a CmTask object.
// CmTask will be destroyed when CmDevice is destroyed.
// Here, the application destroys the CmTask object by itself.
cm_result_check(device->DestroyTask(task));
// For horizontal direction.
// Creates the kernel.
// Param program: CM Program from which the kernel is created.
// Param "gaussianHorizontal": The kernel name which should be no more than
// 256 bytes including the null terminator.
CmKernel *kernel_horizontal = nullptr;
cm_result_check(device->CreateKernel(program,
"gaussianHorizontal",
kernel_horizontal));
// Creates the output surface. The width, height and format are the same as
// the input surface.
// The output surface contains the output of kernel_horizontal.
CmSurface2D *output_surface = nullptr;
cm_result_check(device->CreateSurface2D(4 * width,
height,
CM_SURFACE_FORMAT_A8,
output_surface));
// Gets the output surface index.
SurfaceIndex *output_surface_idx = nullptr;
cm_result_check(output_surface->GetIndex(output_surface_idx));
// Sets a per kernel argument.
// Sets the output of kernel_vertical as the input of kernel_horizontal.
// Sets temp surface index as the first argument of kernel_horizontal.
// Sets output surface index as the second argument of kernel_horizontal.
cm_result_check(kernel_horizontal->SetKernelArg(0,
sizeof(SurfaceIndex),
temp_surface_idx));
cm_result_check(kernel_horizontal->SetKernelArg(1,
sizeof(SurfaceIndex),
output_surface_idx));
// Sets the image width and height as the third and the fourth argument
// of kernel_horizontal.
cm_result_check(kernel_horizontal->SetKernelArg(2, 4, &width));
cm_result_check(kernel_horizontal->SetKernelArg(3, 4, &height));
// Sets filter coefficients as the rest arguments of kernel_horizontal.
cm_result_check(kernel_horizontal->SetKernelArg(4, 4, &a0));
cm_result_check(kernel_horizontal->SetKernelArg(5, 4, &a1));
cm_result_check(kernel_horizontal->SetKernelArg(6, 4, &a2));
cm_result_check(kernel_horizontal->SetKernelArg(7, 4, &a3));
cm_result_check(kernel_horizontal->SetKernelArg(8, 4, &b1));
cm_result_check(kernel_horizontal->SetKernelArg(9, 4, &b2));
cm_result_check(kernel_horizontal->SetKernelArg(10, 4, &coefp));
cm_result_check(kernel_horizontal->SetKernelArg(11, 4, &coefn));
// Each CmKernel can be executed by multiple concurrent threads.
// Here, for "kernel_horizontal" kernel, each thread works on
// NUM_ROWS_PER_THREAD rows in horizontal direction.
int thread_height = height / NUM_ROWS_PER_THREAD;
// Creates a CmThreadSpace object.
// There are two usage models for the thread space. One is to define the
// dependency between threads to run in the GPU. The other is to define a
// thread space where each thread can get a pair of coordinates during
// kernel execution. For this example, we use the latter usage model.
cm_result_check(device->CreateThreadSpace(thread_height,
1,
thread_space));
// Creates a CmTask object.
// The CmTask object is a container for CmKernel pointers. It is used to
// enqueue the kernels for execution.
cm_result_check(device->CreateTask(task));
// Adds a CmKernel pointer to CmTask.
// This task has one kernel.
cm_result_check(task->AddKernel(kernel_horizontal));
// Launches the task on the GPU. Enqueue is a non-blocking call, i.e. the
// function returns immediately without waiting for the GPU to start or
// finish execution of the task. The runtime will query the HW status. If
// the hardware is not busy, the runtime will submit the task to the
// driver/HW; otherwise, the runtime will submit the task to the driver/HW
// at another time.
// An event, "sync_event", is created to track the status of the task.
cm_result_check(cmd_queue->Enqueue(task,
sync_event,
thread_space));
// Destroys a CmTask object.
// CmTask will be destroyed when CmDevice is destroyed.
// Here, the application destroys the CmTask object by itself.
cm_result_check(device->DestroyTask(task));
// Reads the output surface content to the system memory using the CPU.
// The size of data copied is the size of data in Surface.
// It is a blocking call. The function will not return until the copy
// operation is completed.
// The dependent event "sync_event" ensures that the reading of the surface
// will not happen until its state becomes CM_STATUS_FINISHED.
cm_result_check(output_surface->ReadSurface(rgba_to_rgb,
sync_event));
// Destroys the CmDevice.
// Also destroys surfaces, kernels, tasks, thread spaces, and queues that
// were created using this device instance that have not explicitly been
// destroyed by calling the respective destroy functions.
cm_result_check(::DestroyCmDevice(device));
// Converts image format from RGBA to RGB.
unsigned char *tmp = new unsigned char[num_pixels * 3];
for (int i = 0; i < num_pixels; i++) {
tmp[i * 3] = rgba_to_rgb[i * 4];
tmp[i * 3 + 1] = rgba_to_rgb[i * 4 + 1];
tmp[i * 3 + 2] = rgba_to_rgb[i * 4 + 2];
}
output_image.setData(tmp);
// Saves the output image data into the file "blur_out.bmp".
output_image.save("blur_out.bmp");
// Frees memory.
delete[] rgb_to_rgba;
delete[] rgba_to_rgb;
// Checks result.
if (BitMap::checkResult("blur_out.bmp",
"blur_gold.bmp",
5)) {
std::cout << "PASSED" << std::endl;
return 0;
} else {
std::cout << "FAILED" << std::endl;
return -1;
}
}
Section 2: Building the server (kernel) program
Assume the file is named gauss_genx.cpp.
#include <cm/cm.h>
#define NUM_COMPONENTS 4
// number of rows we read in at once
#define NUM_ROWS_PER_ITER 8
// number of columns per thread
#define NUM_COLS_PER_THREAD 8
#define SIMD_SIZE (NUM_COLS_PER_THREAD * NUM_COMPONENTS)
#define CLAMP_TO_EDGE 1
// for horizontal direction
// number of rows per thread
#define NUM_ROWS_PER_THREAD 8
// number of columns we read in at once
#define NUM_COLS_PER_ITER 8
// Each thread processes 32 columns independently
// For now assume height is divisible by 8
extern "C" _GENX_MAIN_ void
gaussianVertical( SurfaceIndex INBUF, SurfaceIndex OUTBUF, int width, int height,
                  float a0, float a1, float a2, float a3,
                  float b1, float b2, float coefp, float coefn )
{
matrix<uchar, NUM_ROWS_PER_ITER, SIMD_SIZE> image;
matrix<uchar, NUM_ROWS_PER_ITER, SIMD_SIZE> outImage;
vector<float, SIMD_SIZE> in;
vector<float, SIMD_SIZE> out;
vector<float, SIMD_SIZE> inMinusOne;
vector<float, SIMD_SIZE> outMinusOne;
vector<float, SIMD_SIZE> outMinusTwo;
uint id = get_thread_origin_x();
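// The client creates a (width / NUM_COLS_PER_THREAD) x 1 thread space, so each
// thread owns a strip of NUM_COLS_PER_THREAD pixels (SIMD_SIZE bytes in RGBA)
// and walks it over the full image height.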
#if CLAMP_TO_EDGE
matrix<uchar, 1, SIMD_SIZE> firstRow;
read( INBUF, id * SIMD_SIZE, 0, firstRow);
inMinusOne = firstRow;
inMinusOne *= 1/255.0f;
outMinusTwo = coefp * inMinusOne;
outMinusOne = outMinusTwo;
#else
inMinusOne = 0;
outMinusOne = 0;
outMinusTwo = 0;
#endif
//read in 8 rows at a time
for( int i = 0; i < height; i += NUM_ROWS_PER_ITER ) {
read( INBUF, id * SIMD_SIZE, i, image );
#pragma unroll
for( unsigned j = 0; j < NUM_ROWS_PER_ITER; j++ ) {
in = image.row(j);
in *= 1/255.0f;
//out = a0 * in + a1 * inMinusOne - b1 * outMinusOne - b2 * outMinusTwo;
out = a0 * in + a1 * inMinusOne - (b1 * outMinusOne + b2 * outMinusTwo);
inMinusOne = in;
outMinusTwo = outMinusOne;
outMinusOne = out;
//clamp the value to [0,1]
out = cm_add(out, 0.0f, SAT);
outImage.row(j) = out * 255.0f;
}
//write back to surface
write( OUTBUF, id*SIMD_SIZE, i, outImage );
}
vector<float, SIMD_SIZE> inPlusOne;
vector<float, SIMD_SIZE> inPlusTwo;
vector<float, SIMD_SIZE> outPlusOne;
vector<float, SIMD_SIZE> outPlusTwo;
vector<float, SIMD_SIZE> temp;
#if CLAMP_TO_EDGE
matrix<uchar, 1, SIMD_SIZE> lastRow;
read( INBUF, id * SIMD_SIZE, height - 1, lastRow );
inPlusOne = lastRow;
inPlusOne *= 1/255.0f;
inPlusTwo = inPlusOne;
outPlusOne = coefn * inPlusOne;
outPlusTwo = outPlusOne;
#else
inPlusOne = 0;
inPlusTwo = 0;
outPlusOne = 0;
outPlusTwo = 0;
#endif
//read 8 rows at a time, in reverse direction
for( int i = height - NUM_ROWS_PER_ITER; i >= 0; i -= NUM_ROWS_PER_ITER ) {
read( INBUF, id * SIMD_SIZE, i, image );
read( MODIFIED(OUTBUF), id * SIMD_SIZE, i, outImage );
#pragma unroll
for( int j = NUM_ROWS_PER_ITER - 1; j >= 0; j-- ) {
in = image.row(j);
in *= 1 / 255.0f;
//temp = a2 * inPlusOne + a3 * inPlusTwo - b1 * outPlusOne - b2 * outPlusTwo;
temp = a2 * inPlusOne + a3 * inPlusTwo - (b1 * outPlusOne + b2 * outPlusTwo);
inPlusTwo = inPlusOne;
inPlusOne = in;
outPlusTwo = outPlusOne;
outPlusOne = temp;
out = outImage.row(j);
out = cm_add( out * (1/255.0f), temp, SAT );
outImage.row(j) = out * 255;
}
//write back to surface
write( OUTBUF, id*SIMD_SIZE, i, outImage );
}
}
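// Helper kernel that transposes an 8x8 block of 32-bit (RGBA) pixels.
// The client program in Section 1 never creates this kernel; it is not needed
// for the blur itself.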
extern "C" _GENX_MAIN_ void
transpose( SurfaceIndex INBUF, SurfaceIndex OUTBUF, unsigned id, int width, int height ) {
matrix<uint, 8, 8> in;
matrix<uint, 8, 8> out;
for( int i = 0; i < height; i += 8 ) {
read( INBUF, id * 32, i, in );
out.row(0) = in.column(0);
out.row(1) = in.column(1);
out.row(2) = in.column(2);
out.row(3) = in.column(3);
out.row(4) = in.column(4);
out.row(5) = in.column(5);
out.row(6) = in.column(6);
out.row(7) = in.column(7);
write( OUTBUF, i * 4, id * 8, out );
}
}
// Like gaussianVertical, except we process 8 independent rows at once
extern "C" _GENX_MAIN_ void
gaussianHorizontal( SurfaceIndex INBUF, SurfaceIndex OUTBUF, int width, int height,
                    float a0, float a1, float a2, float a3,
                    float b1, float b2, float coefp, float coefn )
{
matrix<uchar, NUM_ROWS_PER_THREAD, NUM_COLS_PER_ITER * NUM_COMPONENTS> image;
matrix<uchar, NUM_ROWS_PER_THREAD, NUM_COLS_PER_ITER * NUM_COMPONENTS> outImage;
matrix<float, NUM_ROWS_PER_THREAD, NUM_COMPONENTS> in;
matrix<float, NUM_ROWS_PER_THREAD, NUM_COMPONENTS> out;
matrix<float, NUM_ROWS_PER_THREAD, NUM_COMPONENTS> inMinusOne;
matrix<float, NUM_ROWS_PER_THREAD, NUM_COMPONENTS> outMinusOne;
matrix<float, NUM_ROWS_PER_THREAD, NUM_COMPONENTS> outMinusTwo;
uint id = get_thread_origin_x();
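// The client creates a (height / NUM_ROWS_PER_THREAD) x 1 thread space, so each
// thread owns a band of NUM_ROWS_PER_THREAD rows and sweeps it left to right,
// then right to left.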
#if CLAMP_TO_EDGE
matrix<uchar, NUM_ROWS_PER_THREAD, NUM_COMPONENTS> firstColumn;
read( MODIFIED(INBUF), 0, id * NUM_ROWS_PER_THREAD, firstColumn );
inMinusOne = firstColumn;
inMinusOne *= 1/255.0f;
outMinusTwo = coefp * inMinusOne;
outMinusOne = outMinusTwo;
#else
inMinusOne = 0;
outMinusOne = 0;
outMinusTwo = 0;
#endif
//read 8 rows at a time
for( int i = 0; i < width; i += NUM_COLS_PER_ITER ) {
read( MODIFIED(INBUF), i * NUM_COMPONENTS, id * NUM_ROWS_PER_THREAD, image );
#pragma unroll
for( unsigned j = 0; j < NUM_COLS_PER_ITER; j++ ) {
in = image.select<NUM_ROWS_PER_THREAD, 1, NUM_COMPONENTS, 1>(0, j * NUM_COMPONENTS);
in *= 1/255.0f;
//out = a0 * in + a1 * inMinusOne - b1 * outMinusOne - b2 * outMinusTwo;
out = a0 * in + a1 * inMinusOne - (b1 * outMinusOne + b2 * outMinusTwo);
inMinusOne = in;
outMinusTwo = outMinusOne;
outMinusOne = out;
//clamp the value to [0,1]
out = cm_add( out, 0.0f, SAT ) * 255.0f;
outImage.select<NUM_ROWS_PER_THREAD, 1, NUM_COMPONENTS, 1>(0, j * NUM_COMPONENTS) = out;
}
//write back to surface
write( OUTBUF, i * NUM_COMPONENTS, id * NUM_ROWS_PER_THREAD, outImage );
}
//reverse direction
matrix<float, NUM_ROWS_PER_THREAD, NUM_COMPONENTS> inPlusOne;
matrix<float, NUM_ROWS_PER_THREAD, NUM_COMPONENTS> inPlusTwo;
matrix<float, NUM_ROWS_PER_THREAD, NUM_COMPONENTS> outPlusOne;
matrix<float, NUM_ROWS_PER_THREAD, NUM_COMPONENTS> outPlusTwo;
matrix<float, NUM_ROWS_PER_THREAD, NUM_COMPONENTS> temp;
#if CLAMP_TO_EDGE
matrix<uchar, NUM_ROWS_PER_THREAD, NUM_COMPONENTS> lastColumn;
read( MODIFIED(INBUF), (width - 1) * NUM_COMPONENTS, id * 8, lastColumn );
inPlusOne = lastColumn;
inPlusOne *= 1/255.0f;
inPlusTwo = inPlusOne;
outPlusOne = coefn * inPlusOne;
outPlusTwo = outPlusOne;
#else
inPlusOne = 0;
inPlusTwo = 0;
outPlusOne = 0;
outPlusTwo = 0;
#endif
for( int i = width - NUM_COLS_PER_ITER; i >= 0; i -= NUM_COLS_PER_ITER ) {
read( MODIFIED(INBUF), i * NUM_COMPONENTS, id * NUM_ROWS_PER_THREAD, image );
read( MODIFIED(OUTBUF), i * NUM_COMPONENTS, id * NUM_ROWS_PER_THREAD, outImage );
#pragma unroll
for( int j = NUM_COLS_PER_ITER - 1; j >= 0; j-- ) {
in = image.select<NUM_ROWS_PER_THREAD, 1, NUM_COMPONENTS, 1>(0, j * NUM_COMPONENTS);
in *= 1/255.0f;
//temp = a2 * inPlusOne + a3 * inPlusTwo - b1 * outPlusOne - b2 * outPlusTwo;
temp = a2 * inPlusOne + a3 * inPlusTwo - (b1 * outPlusOne + b2 * outPlusTwo);
inPlusTwo = inPlusOne;
inPlusOne = in;
outPlusTwo = outPlusOne;
outPlusOne = temp;
//The mul * 1 forces out to not be coalesced with outImage, so we can use SIMD16
//operations instead of SIMD4
out = outImage.select<NUM_ROWS_PER_THREAD, 1, NUM_COMPONENTS, 1>(0, j * NUM_COMPONENTS) * 1.0f;
//out = outImage.select(0, j*NUM_COMPONENTS);
out = cm_add( out * (1/255.0f), temp, SAT );
outImage.select<NUM_ROWS_PER_THREAD, 1, NUM_COMPONENTS, 1>(0, j * NUM_COMPONENTS) = out * 255.0f;
}
write( OUTBUF, i * NUM_COMPONENTS, id * 8, outImage );
}
}
Section 3: Compilation, the most important step
In Section 1, one line of code is the key:
std::string isa_code = cm::util::isa::loadFile("gauss_genx.isa");
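The actual helper lives in common/isa_helpers.h. What it amounts to is reading the entire .isa file into a std::string, with an empty string signalling failure (which the client checks for). A minimal stand-in, using nothing but standard C++ file I/O, could look like the sketch below; the name loadIsaFile is only for illustration, and the real helper may differ in details.
#include <fstream>
#include <sstream>
#include <string>

// Minimal sketch of an ISA loader: read the whole file in binary mode and
// return its contents; an empty string means the file could not be opened.
std::string loadIsaFile(const std::string &path) {
  std::ifstream file(path, std::ios::binary);
  if (!file)
    return std::string();
  std::ostringstream buffer;
  buffer << file.rdbuf();   // slurp the entire file into the string stream
  return buffer.str();
}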
So where does this gauss_genx.isa come from? Readers who have done deep learning on NVIDIA GPUs will have run into the acronym ISA (instruction set architecture); this article will not go into that. The point here is simply to show how gauss_genx.cpp is turned into gauss_genx.isa.
First, you need to get the CM compiler.
(1) Clone the code from https://github.com/intel/cm-compiler
(2) Install VS2015 (or a later version), Python 2.7, Cygwin, CMake, unzip, and curl.
(3) Open Cygwin, change into the cm-compiler directory, and run the following:
cmake path/to/llvm/source/root
cmake --build .
(4) Run support/scripts/build.bash -s vs2015 -d -m --64
(5) Locate cmc.exe; it should be in the build.64.vs2015 folder.
(6) In a cmd window, run:
cmc.exe -isystem path\to\cm-compiler\support\include path\to\gauss_genx.cpp -march=SKL
(7) You will see that gauss_genx.isa has been generated in the folder containing cmc.exe.
(8) If you open the .isa file, you will find it is just raw binary (hexadecimal) data; a small sketch for inspecting it follows this list. How it is loaded and executed at a lower level is beyond the scope of this article.
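To confirm this from code, the following minimal sketch (plain standard C++, nothing CM-specific; it assumes gauss_genx.isa sits in the current directory) prints the first bytes of the file in hexadecimal:
#include <cstdio>
#include <fstream>
#include <vector>

int main() {
  // Open the generated virtual ISA file in binary mode.
  std::ifstream file("gauss_genx.isa", std::ios::binary);
  if (!file) {
    std::fprintf(stderr, "cannot open gauss_genx.isa\n");
    return 1;
  }
  // Read up to the first 16 bytes and print them as hex values.
  std::vector<unsigned char> head(16);
  file.read(reinterpret_cast<char *>(head.data()), head.size());
  for (std::streamsize i = 0; i < file.gcount(); ++i)
    std::printf("%02x ", head[i]);
  std::printf("\n");
  return 0;
}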
With that, the basics of CM have been covered. Much like CUDA, CM can be integrated into a deep learning framework to implement the framework's kernels.
The next update will focus mainly on deep learning applications. Why didn't I cover CUDA first? My plan is to start with Intel's stack and then, at the application level, concentrate on what most people actually use.
So the upcoming posts will take you from scratch, on a laptop, through building a deep-learning-based object detection model, integrating the trained model with C++, and shipping it as part of the final deliverables.
Stay tuned.