tesseract源码Page Layout解读(倾斜矫正)

http://blog.csdn.net/kaelsass/article/details/46874627

http://www.jianshu.com/p/7c63fd62ea28


代码调用



代码附录

Tesseract::SegmentPage[ccmain/pagesegmain.cpp] -> 

/**
 * Segment the page according to the current value of tessedit_pageseg_mode.
 * pix_binary_ is used as the source image and should not be NULL.
 * On return the blocks list owns all the constructed page layout.
 */
int Tesseract::SegmentPage(const STRING* input_file, BLOCK_LIST* blocks,
                           Tesseract* osd_tess, OSResults* osr) {
  ASSERT_HOST(pix_binary_ != NULL);
  int width = pixGetWidth(pix_binary_);
  int height = pixGetHeight(pix_binary_);
  // Get page segmentation mode.
  PageSegMode pageseg_mode = static_cast(
      static_cast(tessedit_pageseg_mode));
  // If a UNLV zone file can be found, use that instead of segmentation.
  if (!PSM_COL_FIND_ENABLED(pageseg_mode) &&
      input_file != NULL && input_file->length() > 0) {
    STRING name = *input_file;
    const char* lastdot = strrchr(name.string(), '.');
    if (lastdot != NULL)
      name[lastdot - name.string()] = '\0';
    read_unlv_file(name, width, height, blocks);
  }
  if (blocks->empty()) {
    // No UNLV file present. Work according to the PageSegMode.
    // First make a single block covering the whole image.
    BLOCK_IT block_it(blocks);
    BLOCK* block = new BLOCK("", TRUE, 0, 0, 0, 0, width, height);
    block->set_right_to_left(right_to_left());
    block_it.add_to_end(block);
  } else {
    // UNLV file present. Use PSM_SINGLE_BLOCK.
    pageseg_mode = PSM_SINGLE_BLOCK;
  }
  // The diacritic_blobs holds noise blobs that may be diacritics. They
  // are separated out on areas of the image that seem noisy and short-circuit
  // the layout process, going straight from the initial partition creation
  // right through to after word segmentation, where they are added to the
  // rej_cblobs list of the most appropriate word. From there classification
  // will determine whether they are used.
  BLOBNBOX_LIST diacritic_blobs;
  int auto_page_seg_ret_val = 0;
  TO_BLOCK_LIST to_blocks;
  if (PSM_OSD_ENABLED(pageseg_mode) || PSM_BLOCK_FIND_ENABLED(pageseg_mode) ||
      PSM_SPARSE(pageseg_mode)) {
    auto_page_seg_ret_val = AutoPageSeg(
        pageseg_mode, blocks, &to_blocks,
        enable_noise_removal ? &diacritic_blobs : NULL, osd_tess, osr);
    if (pageseg_mode == PSM_OSD_ONLY)
      return auto_page_seg_ret_val;
    // To create blobs from the image region bounds uncomment this line:
    //  to_blocks.clear();  // Uncomment to go back to the old mode.
  } else {
    deskew_ = FCOORD(1.0f, 0.0f);
    reskew_ = FCOORD(1.0f, 0.0f);
    if (pageseg_mode == PSM_CIRCLE_WORD) {
      Pix* pixcleaned = RemoveEnclosingCircle(pix_binary_);
      if (pixcleaned != NULL) {
        pixDestroy(&pix_binary_);
        pix_binary_ = pixcleaned;
      }
    }
  }

  if (auto_page_seg_ret_val < 0) {
    return -1;
  }

  if (blocks->empty()) {
    if (textord_debug_tabfind)
      tprintf("Empty page\n");
    return 0;  // AutoPageSeg found an empty page.
  }
  bool splitting =
      pageseg_devanagari_split_strategy != ShiroRekhaSplitter::NO_SPLIT;
  bool cjk_mode = textord_use_cjk_fp_model;

  textord_.TextordPage(pageseg_mode, reskew_, width, height, pix_binary_,
                       pix_thresholds_, pix_grey_, splitting || cjk_mode,
                       &diacritic_blobs, blocks, &to_blocks);
  return auto_page_seg_ret_val;
}


Tesseract::SetupPageSegAndDetectOrientation[ccmain/pagesegmain.cpp] -> 

 * Sets up auto page segmentation, determines the orientation, and corrects it.
 * Somewhat arbitrary chunk of functionality, factored out of AutoPageSeg to
 * facilitate testing.
 * photo_mask_pix is a pointer to a NULL pointer that will be filled on return
 * with the leptonica photo mask, which must be pixDestroyed by the caller.
 * to_blocks is an empty list that will be filled with (usually a single)
 * block that is used during layout analysis. This ugly API is required
 * because of the possibility of a unlv zone file.
 * TODO(rays) clean this up.
 * See AutoPageSeg for other arguments.
 * The returned ColumnFinder must be deleted after use.
 */
ColumnFinder* Tesseract::SetupPageSegAndDetectOrientation(
    PageSegMode pageseg_mode, BLOCK_LIST* blocks, Tesseract* osd_tess,
    OSResults* osr, TO_BLOCK_LIST* to_blocks, Pix** photo_mask_pix,
    Pix** music_mask_pix) {
  int vertical_x = 0;
  int vertical_y = 1;
  TabVector_LIST v_lines;
  TabVector_LIST h_lines;
  ICOORD bleft(0, 0);

  ASSERT_HOST(pix_binary_ != NULL);
  if (tessedit_dump_pageseg_images) {
    pixa_debug_.AddPix(pix_binary_, "PageSegInput");
  }
  // Leptonica is used to find the rule/separator lines in the input.
  LineFinder::FindAndRemoveLines(source_resolution_,
                                 textord_tabfind_show_vlines, pix_binary_,
                                 &vertical_x, &vertical_y, music_mask_pix,
                                 &v_lines, &h_lines);
  if (tessedit_dump_pageseg_images) {
    pixa_debug_.AddPix(pix_binary_, "NoLines");
  }
  // Leptonica is used to find a mask of the photo regions in the input.
  *photo_mask_pix = ImageFind::FindImages(pix_binary_, &pixa_debug_);
  if (tessedit_dump_pageseg_images) {
    pixa_debug_.AddPix(pix_binary_, "NoImages");
  }
  if (!PSM_COL_FIND_ENABLED(pageseg_mode)) v_lines.clear();

  // The rest of the algorithm uses the usual connected components.
  textord_.find_components(pix_binary_, blocks, to_blocks);

  TO_BLOCK_IT to_block_it(to_blocks);
  // There must be exactly one input block.
  // TODO(rays) handle new textline finding with a UNLV zone file.
  ASSERT_HOST(to_blocks->singleton());
  TO_BLOCK* to_block = to_block_it.data();
  TBOX blkbox = to_block->block->bounding_box();
  ColumnFinder* finder = NULL;
  int estimated_resolution = source_resolution_;
  if (source_resolution_ == kMinCredibleResolution) {
    // Try to estimate resolution from typical body text size.
    int res = IntCastRounded(to_block->line_size * kResolutionEstimationFactor);
    if (res > estimated_resolution && res < kMaxCredibleResolution) {
      estimated_resolution = res;
      tprintf("Estimating resolution as %d\n", estimated_resolution);
    }
  }

  if (to_block->line_size >= 2) {
    finder = new ColumnFinder(static_cast(to_block->line_size),
                              blkbox.botleft(), blkbox.topright(),
                              estimated_resolution, textord_use_cjk_fp_model,
                              textord_tabfind_aligned_gap_fraction, &v_lines,
                              &h_lines, vertical_x, vertical_y);

    finder->SetupAndFilterNoise(pageseg_mode, *photo_mask_pix, to_block);

    if (equ_detect_) {
      equ_detect_->LabelSpecialText(to_block);
    }

    BLOBNBOX_CLIST osd_blobs;
    // osd_orientation is the number of 90 degree rotations to make the
    // characters upright. (See osdetect.h for precise definition.)
    // We want the text lines horizontal, (vertical text indicates vertical
    // textlines) which may conflict (eg vertically written CJK).
    int osd_orientation = 0;
    bool vertical_text = textord_tabfind_force_vertical_text ||
                         pageseg_mode == PSM_SINGLE_BLOCK_VERT_TEXT;
    if (!vertical_text && textord_tabfind_vertical_text &&
        PSM_ORIENTATION_ENABLED(pageseg_mode)) {
      vertical_text =
          finder->IsVerticallyAlignedText(textord_tabfind_vertical_text_ratio,
                                          to_block, &osd_blobs);
    }
    if (PSM_OSD_ENABLED(pageseg_mode) && osd_tess != NULL && osr != NULL) {
      GenericVector osd_scripts;
      if (osd_tess != this) {
        // We are running osd as part of layout analysis, so constrain the
        // scripts to those allowed by *this.
        AddAllScriptsConverted(unicharset, osd_tess->unicharset, &osd_scripts);
        for (int s = 0; s < sub_langs_.size(); ++s) {
          AddAllScriptsConverted(sub_langs_[s]->unicharset,
                                 osd_tess->unicharset, &osd_scripts);
        }
      }
      os_detect_blobs(&osd_scripts, &osd_blobs, osr, osd_tess);
      if (pageseg_mode == PSM_OSD_ONLY) {
        delete finder;
        return NULL;
      }
      osd_orientation = osr->best_result.orientation_id;
      double osd_score = osr->orientations[osd_orientation];
      double osd_margin = min_orientation_margin * 2;
      for (int i = 0; i < 4; ++i) {
        if (i != osd_orientation &&
            osd_score - osr->orientations[i] < osd_margin) {
          osd_margin = osd_score - osr->orientations[i];
        }
      }
      int best_script_id = osr->best_result.script_id;
      const char* best_script_str =
          osd_tess->unicharset.get_script_from_script_id(best_script_id);
      bool cjk = best_script_id == osd_tess->unicharset.han_sid() ||
          best_script_id == osd_tess->unicharset.hiragana_sid() ||
          best_script_id == osd_tess->unicharset.katakana_sid() ||
          strcmp("Japanese", best_script_str) == 0 ||
          strcmp("Korean", best_script_str) == 0 ||
          strcmp("Hangul", best_script_str) == 0;
      if (cjk) {
        finder->set_cjk_script(true);
      }
      if (osd_margin < min_orientation_margin) {
        // The margin is weak.
        if (!cjk && !vertical_text && osd_orientation == 2) {
          // upside down latin text is improbable with such a weak margin.
          tprintf("OSD: Weak margin (%.2f), horiz textlines, not CJK: "
                  "Don't rotate.\n", osd_margin);
          osd_orientation = 0;
        } else {
          tprintf(
              "OSD: Weak margin (%.2f) for %d blob text block, "
              "but using orientation anyway: %d\n",
              osd_margin, osd_blobs.length(), osd_orientation);
        }
      }
    }
    osd_blobs.shallow_clear();
    finder->CorrectOrientation(to_block, vertical_text, osd_orientation);
  }

  return finder;
}

}  // namespace tesseract.

LineFinder::FindAndRemoveLines[textord/linefind.cpp]

// Finds vertical and horizontal line objects in the given pix.
// Uses the given resolution to determine size thresholds instead of any
// that may be present in the pix.
// The output vertical_x and vertical_y contain a sum of the output vectors,
// thereby giving the mean vertical direction.
// If pix_music_mask != NULL, and music is detected, a mask of the staves
// and anything that is connected (bars, notes etc.) will be returned in
// pix_music_mask, the mask subtracted from pix, and the lines will not
// appear in v_lines or h_lines.
// The output vectors are owned by the list and Frozen (cannot refit) by
// having no boxes, as there is no need to refit or merge separator lines.
// The detected lines are removed from the pix.
void LineFinder::FindAndRemoveLines(int resolution, bool debug, Pix* pix,
                                    int* vertical_x, int* vertical_y,
                                    Pix** pix_music_mask,
                                    TabVector_LIST* v_lines,
                                    TabVector_LIST* h_lines) {
  PERF_COUNT_START("FindAndRemoveLines")
  if (pix == NULL || vertical_x == NULL || vertical_y == NULL) {
    tprintf("Error in parameters for LineFinder::FindAndRemoveLines\n");
    return;
  }
  Pix* pix_vline = NULL;
  Pix* pix_non_vline = NULL;
  Pix* pix_hline = NULL;
  Pix* pix_non_hline = NULL;
  Pix* pix_intersections = NULL;
  Pixa* pixa_display = debug ? pixaCreate(0) : NULL;
  GetLineMasks(resolution, pix, &pix_vline, &pix_non_vline, &pix_hline,
               &pix_non_hline, &pix_intersections, pix_music_mask,
               pixa_display);
  // Find lines, convert to TabVector_LIST and remove those that are used.
  FindAndRemoveVLines(resolution, pix_intersections, vertical_x, vertical_y,
                      &pix_vline, pix_non_vline, pix, v_lines);
  if (pix_hline != NULL) {
    // Recompute intersections and re-filter false positive h-lines.
    if (pix_vline != NULL)
      pixAnd(pix_intersections, pix_vline, pix_hline);
    else
      pixDestroy(&pix_intersections);
    if (!FilterFalsePositives(resolution, pix_non_hline, pix_intersections,
                              pix_hline)) {
      pixDestroy(&pix_hline);
    }
  }
  FindAndRemoveHLines(resolution, pix_intersections, *vertical_x, *vertical_y,
                      &pix_hline, pix_non_hline, pix, h_lines);
  if (pixa_display != NULL && pix_vline != NULL)
    pixaAddPix(pixa_display, pix_vline, L_CLONE);
  if (pixa_display != NULL && pix_hline != NULL)
    pixaAddPix(pixa_display, pix_hline, L_CLONE);
  if (pix_vline != NULL && pix_hline != NULL) {
    // Remove joins (intersections) where lines cross, and the residue.
    // Recalculate the intersections, since some lines have been deleted.
    pixAnd(pix_intersections, pix_vline, pix_hline);
    // Fatten up the intersections and seed-fill to get the intersection
    // residue.
    Pix* pix_join_residue = pixDilateBrick(NULL, pix_intersections, 5, 5);
    pixSeedfillBinary(pix_join_residue, pix_join_residue, pix, 8);
    // Now remove the intersection residue.
    pixSubtract(pix, pix, pix_join_residue);
    pixDestroy(&pix_join_residue);
  }
  // Remove any detected music.
  if (pix_music_mask != NULL && *pix_music_mask != NULL) {
    if (pixa_display != NULL)
      pixaAddPix(pixa_display, *pix_music_mask, L_CLONE);
    pixSubtract(pix, pix, *pix_music_mask);
  }
  if (pixa_display != NULL)
    pixaAddPix(pixa_display, pix, L_CLONE);

  pixDestroy(&pix_vline);
  pixDestroy(&pix_non_vline);
  pixDestroy(&pix_hline);
  pixDestroy(&pix_non_hline);
  pixDestroy(&pix_intersections);
  if (pixa_display != NULL) {
    pixaConvertToPdf(pixa_display, resolution, 1.0f, 0, 0, "LineFinding",
                     "vhlinefinding.pdf");
    pixaDestroy(&pixa_display);
  }
  PERF_COUNT_END

ImageFind::FindImages [textord/linefind.cpp]

// Finds image regions within the BINARY source pix (page image) and returns
// the image regions as a mask image.
// The returned pix may be NULL, meaning no images found.
// If not NULL, it must be PixDestroyed by the caller.
// If textord_tabfind_show_images, debug images are appended to pixa_debug.
Pix* ImageFind::FindImages(Pix* pix, DebugPixa* pixa_debug) {
  // Not worth looking at small images.
  if (pixGetWidth(pix) < kMinImageFindSize ||
      pixGetHeight(pix) < kMinImageFindSize)
    return pixCreate(pixGetWidth(pix), pixGetHeight(pix), 1);

  // Reduce by factor 2.
  Pix *pixr = pixReduceRankBinaryCascade(pix, 1, 0, 0, 0);
  if (textord_tabfind_show_images && pixa_debug != nullptr)
    pixa_debug->AddPix(pixr, "CascadeReduced");

  // Get the halftone mask directly from Leptonica.
  //
  // Leptonica will print an error message and return NULL if we call
  // pixGenHalftoneMask(pixr, NULL, ...) with too small image, so we
  // want to bypass that.
  if (pixGetWidth(pixr) < kMinImageFindSize ||
      pixGetHeight(pixr) < kMinImageFindSize) {
    pixDestroy(&pixr);
    return pixCreate(pixGetWidth(pix), pixGetHeight(pix), 1);
  }
  // Get the halftone mask.
  l_int32 ht_found = 0;
  Pixa* pixadb = (textord_tabfind_show_images && pixa_debug != nullptr)
                     ? pixaCreate(0)
                     : nullptr;
  Pix* pixht2 = pixGenerateHalftoneMask(pixr, NULL, &ht_found, pixadb);
  if (pixadb) {
    Pix* pixdb = pixaDisplayTiledInColumns(pixadb, 3, 1.0, 20, 2);
    if (textord_tabfind_show_images && pixa_debug != nullptr)
      pixa_debug->AddPix(pixdb, "HalftoneMask");
    pixDestroy(&pixdb);
    pixaDestroy(&pixadb);
  }
  pixDestroy(&pixr);
  if (!ht_found && pixht2 != NULL)
    pixDestroy(&pixht2);
  if (pixht2 == NULL)
    return pixCreate(pixGetWidth(pix), pixGetHeight(pix), 1);

  // Expand back up again.
  Pix *pixht = pixExpandReplicate(pixht2, 2);
  if (textord_tabfind_show_images && pixa_debug != nullptr)
    pixa_debug->AddPix(pixht, "HalftoneReplicated");
  pixDestroy(&pixht2);

  // Fill to capture pixels near the mask edges that were missed
  Pix *pixt = pixSeedfillBinary(NULL, pixht, pix, 8);
  pixOr(pixht, pixht, pixt);
  pixDestroy(&pixt);

  // Eliminate lines and bars that may be joined to images.
  Pix* pixfinemask = pixReduceRankBinaryCascade(pixht, 1, 1, 3, 3);
  pixDilateBrick(pixfinemask, pixfinemask, 5, 5);
  if (textord_tabfind_show_images && pixa_debug != nullptr)
    pixa_debug->AddPix(pixfinemask, "FineMask");
  Pix* pixreduced = pixReduceRankBinaryCascade(pixht, 1, 1, 1, 1);
  Pix* pixreduced2 = pixReduceRankBinaryCascade(pixreduced, 3, 3, 3, 0);
  pixDestroy(&pixreduced);
  pixDilateBrick(pixreduced2, pixreduced2, 5, 5);
  Pix* pixcoarsemask = pixExpandReplicate(pixreduced2, 8);
  pixDestroy(&pixreduced2);
  if (textord_tabfind_show_images && pixa_debug != nullptr)
    pixa_debug->AddPix(pixcoarsemask, "CoarseMask");
  // Combine the coarse and fine image masks.
  pixAnd(pixcoarsemask, pixcoarsemask, pixfinemask);
  pixDestroy(&pixfinemask);
  // Dilate a bit to make sure we get everything.
  pixDilateBrick(pixcoarsemask, pixcoarsemask, 3, 3);
  Pix* pixmask = pixExpandReplicate(pixcoarsemask, 16);
  pixDestroy(&pixcoarsemask);
  if (textord_tabfind_show_images && pixa_debug != nullptr)
    pixa_debug->AddPix(pixmask, "MaskDilated");
  // And the image mask with the line and bar remover.
  pixAnd(pixht, pixht, pixmask);
  pixDestroy(&pixmask);
  if (textord_tabfind_show_images && pixa_debug != nullptr)
    pixa_debug->AddPix(pixht, "FinalMask");
  // Make the result image the same size as the input.
  Pix* result = pixCreate(pixGetWidth(pix), pixGetHeight(pix), 1);
  pixOr(result, result, pixht);
  pixDestroy(&pixht);
  return result;
}


Textord::find_components [textord/tordmain.cpp] 

/**********************************************************************
 * find_components
 *
 * Find the C_OUTLINEs of the connected components in each block, put them
 * in C_BLOBs, and filter them by size, putting the different size
 * grades on different lists in the matching TO_BLOCK in to_blocks.
 **********************************************************************/

void Textord::find_components(Pix* pix, BLOCK_LIST *blocks,
                              TO_BLOCK_LIST *to_blocks) {
  int width = pixGetWidth(pix);
  int height = pixGetHeight(pix);
  if (width > MAX_INT16 || height > MAX_INT16) {
    tprintf("Input image too large! (%d, %d)\n", width, height);
    return;  // Can't handle it.
  }

  set_global_loc_code(LOC_EDGE_PROG);

  BLOCK_IT block_it(blocks);    // iterator
  for (block_it.mark_cycle_pt(); !block_it.cycled_list();
       block_it.forward()) {
    BLOCK* block = block_it.data();
    if (block->poly_block() == NULL || block->poly_block()->IsText()) {
      extract_edges(pix, block);
    }
  }

  assign_blobs_to_blocks2(pix, blocks, to_blocks);
  ICOORD page_tr(width, height);
  filter_blobs(page_tr, to_blocks, !textord_test_landscape);
}





你可能感兴趣的:(Tesseract4.0)