最快的素数筛选法

 http://bailuzhou.blog.163.com/blog/static/536613592007101213946727/

 
题目有点大, 主要为了便于检索到, 小素数筛选法通常采用eratosthenes方法,复杂度
 
为 n * lglgn, 不同实现性能往往差别很大, 看过网上不少人写的筛选法, 大多比较初级。
 
本文采用分段筛选, 多线程, 汇编, 位压缩等优化手段。在双核CPU 2.0G上2秒算出 2^31
 
 以内素数个数, 目前还没有实现存储,下个版本实现之。
 
/******************************************************************
Copyright (C) 2007 Huang Yuanbing
version 1.3, 2007 PrimeNumber
mailto: bailuzhou.dot.163.com
free use for non-commercial purposes
******************************************************************
 
description:
input two numbers beg, end (0 < beg <= end < 2^32)
calculate the number of primes in the interval [beg, end]
for example:
PI[1, 2] = 1, only 2 is prime in this subinterval
PI[1, 2^31] = 105097565, there are 105097565 primes in this interval
 
basic idea:
there are two algorithms in this file. as for the first
algorithm (ALGORITHM = 1), I do not want to describe the idea here;
even though this algorithm is about (30 - 50)% faster than the second one for
larger input intervals, it has very bad performance in many cases
and also cannot save each prime if you want them.
 
as for the second one (ALGORITHM = 2):
this is an improved segmented sieve of Eratosthenes,
using multiple threads, asm and bit compression to optimize the classical algorithm.
to save memory and improve performance, primes are not saved, but it's easy to do that.
for a given subinterval [beg, end] and pn running threads, the ith thread
calculates the subinterval [beg + (end - beg) / pn * (i - 1), beg + (end - beg) / pn * i)
(pay attention to the last thread), and the main thread adds up the results it gets from
the other threads.
 
the time complexity of function PI(0, n) is n * ln(ln n) and the space complexity is sqrt(n).
this algorithm can also be implemented with MPI, OpenMP and multiple processes (IPC)
(only some parts are finished).
 
this source file can be compiled by gcc and vc++
and runs on windows, unix and linux (without macro ASM).
on some older cpus the macro ASM can cause bad performance.
there may be some bugs, as it has not been fully tested.
at present the fastest prime algorithm I know of is ecprime; you can get it from the website:
http://www.primzahlen.de/files/referent/kw/ecprime.source.zip
******************************************************************
******************************************************************/
 
# include <stdio.h>
# include <stdlib.h>
# include <time.h>
# include <math.h>
# include <memory.h>
# include <assert.h>
 
#define ALGORITHM 2
// selects the implementation compiled below: 1 or 2 (see the header comment)
#define MULTI_THREAD
// multi-core CPU optimization: split the range across worker threads
#define ASM0
// assembler optimization switch; the code tests "#ifdef ASM", so the
// trailing 0 keeps the inline-assembly paths disabled
 
#define COMP 4
// a number n lives in byte (n >> COMP) of a bit mask
#define MASK 7
// selects the bit position inside one mask byte
#define MAXP 6600
// capacity of the Prime[] table
// MASKN(n): single-bit mask for number n inside its mask byte.
// With COMP == 3 every number has a bit; otherwise only odd numbers do.
// The argument is parenthesized so that expressions such as MASKN(a + b)
// expand correctly.
#if (COMP == 3)
 #define MASKN(n) (1 << ((n) & MASK))
#else
 #define MASKN(n) (1 << (((n) >> 1) & MASK))
#endif
 
#define THREAD_NUM 4
//the running threads
 
#define DEFAULT_N ((1u << 31) - 1)
//the input range [0, DEFAULT_N]
 
typedef unsigned int uint;
// per-byte bit-count lookup table; filled in by the program at start-up
unsigned char bits[1 << 8];
// Prime[0] holds the number of primes found by sieve(), Prime[1..] the primes
int Prime[MAXP]; // 0 - 2^16
 
int sieve(int);
int PI(int, int);
 
#ifdef MULTI_THREAD
 
// Work item handed to one worker thread: the thread function calls
// PI(beg, end) and stores the returned count in pnums.
struct ThreadInfo
{
 int beg, end; // bounds of the interval passed to PI()
 int pnums;    // result slot written by the thread function
}Threadparam[THREAD_NUM * 2 + 2]; // indexed by worker number in init_pm()/multiThread()
 
#ifdef _WIN32
 # include <windows.h>
DWORD WINAPI Win32ThreadFun(LPVOID pinfo)
#else
 # include <pthread.h>
void* POSIXThreadFun(void *pinfo)
#endif
{
 ThreadInfo *pThreadInfo = (ThreadInfo *) (pinfo);
 pThreadInfo->pnums = PI(pThreadInfo->beg, pThreadInfo->end);
 printf("PI[%10d, %10d] = %d\n", pThreadInfo->beg, pThreadInfo->end, pThreadInfo->pnums);
 return 0;
}
 
int init_pm(int tpnums, uint maxn, int blocksize)
{
 Threadparam[0].end = (maxn / tpnums) - (maxn / tpnums) % blocksize
  + (tpnums - 1) * blocksize * tpnums / 2;
 int bsize = Threadparam[0].end;
 for (int i = 1; i < tpnums; i++){
  Threadparam[i].beg = Threadparam[i - 1].end;
  Threadparam[i].end = Threadparam[i].beg + (bsize -= tpnums * blocksize);
 }
 Threadparam[0].beg = 2;
 Threadparam[tpnums - 1].end = maxn;
 return 0;
}
 
int multiThread(int theadnums, uint maxn)
{
 int i, pnums = 0;
#if ALGORITHM == 1
 for (i = 1; i < theadnums; i++)
  Threadparam[i].beg = Threadparam[i - 1].end = (maxn / theadnums) * i;
 Threadparam[0].beg = 2;
 Threadparam[theadnums - 1].end = maxn;
#else
 init_pm(theadnums, maxn, 255255 << 2);
#endif
 
#ifdef _WIN32
 HANDLE tHand[THREAD_NUM * 2];
 DWORD threadID[THREAD_NUM * 2];
 for (i = 0; i < theadnums; i++){
  tHand[i] = CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE)Win32ThreadFun, (LPVOID)(&Threadparam[i]), 0, &threadID[i]);
  if (tHand[i] == NULL)
   printf("create Win32 thread error\n");
 }
 WaitForMultipleObjects(theadnums, tHand, true, INFINITE);
 for (i = 0; i < theadnums; i++){
  pnums += Threadparam[i].pnums;
  CloseHandle(tHand[i]);
 }
#else
 pthread_t tid[THREAD_NUM * 2];
 for (i = 0; i < theadnums; i++){
  int error = pthread_create(&tid[i], NULL, POSIXThreadFun, &Threadparam[i]);
  if ( error != 0 )
   printf("Create pthread error: %d\n", error);
 }
 for (i = 0; i < theadnums; i++){
  pthread_join(tid[i], NULL);
  pnums += Threadparam[i].pnums;
 }
#endif
 return pnums;
}
#endif
 
#ifdef ASM
// Inline-assembly replacement for the C loop
//     for (; next < len; next += p) mask[...] |= bit(next);
// Only compiled when ASM is defined. Two dialects are provided:
// MSVC "__asm { }" when _CONSOLE is defined, GCC "__asm ( )" otherwise.
//
// mask - bit mask to mark
// next - first position to mark
// len  - exclusive upper bound for positions
// p    - step between marked positions
//
// The loop body runs before the bound is compared, so at least one bit is
// always set; the callers in this file guard the call with "if (next < len)".
// The bound is compared with jl, i.e. as signed integers.
//
// NOTE(review): in the GCC variant p is declared as an "=m" output although
// the code only reads it, %esi is written but missing from the clobber list,
// and the labels Loop1/loop2 are global assembler labels - confirm before
// enabling ASM.
void asmSetBits(unsigned char mask[], int next, int len, int p)
{
#ifdef _CONSOLE
 __asm
 {
#if (COMP == 4 || ALGORITHM == 1)
  //    mov esi, len
  mov edx, p
  mov esi, mask
  mov eax, next
$loop1:
  // edi becomes the byte index, ecx the bit index of position eax
  mov edi, eax
  mov ecx, eax
#if (ALGORITHM != 1 && COMP == 4)
  shr edi, COMP
  shr ecx, 1
#else
  shr edi, 3
#endif
  mov ebx, 1
  and ecx, MASK
  shl ebx, cl
  or byte ptr [esi + edi], bl
  add eax, edx
  cmp eax, len
  jl $loop1
#else
  // one bit per position: bts addresses the bit directly
  mov ebx, [mask]
  mov esi, len
  mov eax, next
  mov edi, p
$loop2:
  bts [ebx], eax
  add eax, edi
  cmp eax, esi
  jl $loop2
#endif
 }
#else
 __asm
 (
#if (COMP == 4 || ALGORITHM == 1)
   // operands: %1 = mask, %2 = next, %3 = len, %4 = p
   "movl %1, %%esi\n"
   "movl %4, %%edx\n"
   "movl %2, %%eax\n"
"Loop1:\n"
   "movl %%eax, %%edi\n"
   "movl %%eax, %%ecx\n"
#if (ALGORITHM != 1 && COMP == 4)
   "shrl $4, %%edi\n"
   "shrb $1, %%cl\n"
#else
   "shrl $3, %%edi\n"
#endif
   "movb $1, %%bl\n"
   "andb $7, %%cl\n"
   "shlb %%cl, %%bl\n"
   "orb  %%bl, (%%esi, %%edi)\n"
   "addl %%edx, %%eax\n"
   "cmpl %3, %%eax\n"
   "jl Loop1\n"
#else
   "leal (%1), %%eax\n"
   "movl %3,  %%esi\n"
   "movl %2, %%ebx\n"
   "movl %4, %%edi\n"
"loop2:\n"
   "btsl %%ebx, (%%eax)\n"
   "addl %%edi, %%ebx\n"
   "cmpl %%esi, %%ebx\n"
   "jl loop2\n"
#endif
   : "=m" (p)
   : "r" (mask), "g" (next), "g" (len),"g" (p)
   : "ax", "memory", "%eax", "%ebx", "%ecx", "%edx", "%edi"
 );
#endif
 //    return;
}
 
// Inline-assembly sum of bits[mask[i]] over a run of mask bytes.
// Only compiled when ASM is defined (MSVC dialect under _CONSOLE,
// GCC dialect otherwise).
//
// mask - byte array to scan
// len  - loop counter start value
// returns the accumulated sum
//
// The counter register starts at len and the x86 "loop" instruction
// decrements it until it reaches zero, so the bytes read are
// mask[len] down to mask[1]; mask[0] is not included.
// NOTE(review): len == 0 would make "loop" wrap the counter - callers
// presumably never pass 0; verify.
// NOTE(review): the GCC variant writes %esi without listing it as a clobber.
int asmSum(const unsigned char mask[], int len)
{
 int pnums = 0;
#ifdef _CONSOLE
 __asm
 {
  mov esi, mask
  mov ecx, len
  xor edx, edx
  xor eax, eax
  xor ebx, ebx
$loopsum:
  // al = mask byte, bl = its table value, edx accumulates
  mov al, byte ptr [esi + ecx]
  mov bl, byte ptr bits[eax]
  add edx, ebx
  loop $loopsum
  mov pnums, edx
 }
#else
 __asm
 (
   // operands: %0 = pnums, %1 = len, %2 = mask, %3 = bits
   "movl %1, %%ecx\n"
   "xorl %%edx, %%edx\n"
   "movl %2, %%edi\n"
   "movl %3, %%esi\n"
   "xorl %%eax, %%eax\n"
   "xorl %%ebx, %%ebx\n"
"loopsum:\n"
   "movb (%%edi, %%ecx), %%al\n"
   "movb (%%esi, %%eax), %%bl\n"
   "addl %%ebx, %%edx\n"
   "loop loopsum\n"
   "movl %%edx, %0\n"
   : "=m" (pnums)
   : "g" (len), "g" (mask), "g"(bits)
   : "ax", "memory", "%eax", "%ebx", "%ecx", "%edx", "%edi"
 );
#endif
 return pnums;
}
#endif
 
/************************************************************************/
/*                          ALGORITHM1                                  */
/************************************************************************/
#if (ALGORITHM == 1)
 
// Prime[2..SP] are the small primes whose multiples sieve() pre-marks in BaseTpl
#define SP 7
// 30030 = 2 * 3 * 5 * 7 * 11 * 13
#define BLOCKSIZE 30030
 
// scratch bit mask for sieve(); afterwards holds the small-prime template
unsigned char BaseTpl[(1 << 16) >> COMP];
// per-prime multipliers: main() fills mprimes[i] from solve(), reduced modulo Prime[i]
int mprimes[MAXP];
 
// Extended Euclid: finds integers x, y with a * x + b * y == c.
// The recursion swaps the roles of x and y on every level; when a reaches 0
// the equation collapses to b * y == c.
void solve(int a, int b, int c, int &x, int &y)
{
 if (a == 0){
  x = 0;
  y = c / b;
  return;
 }

 const int quotient = b / a;
 // solve (b % a) * y' + a * x' == c, with y' stored in y and x' in x
 solve(b % a, a, c, y, x);
 x -= quotient * y;
}
 
// Prepare the per-prime offsets for candidate beg (algorithm 1).
//
// ans  - per-prime residues, updated in place for indices SP..Prime[0]
// beg  - current candidate
// last - previous candidate for which ans[] was updated
//
// Returns 1 (skip this candidate) when beg is even or its bit is set in
// BaseTpl; otherwise advances every ans[k] by mprimes[k] * (beg - last)
// modulo Prime[k] and returns 0.
int init(int ans[], int beg, int last)
{
#if 1
 if ((beg & 1) == 0 || BaseTpl[beg >> COMP] & MASKN(beg))
  return 1;
#else
 // disabled alternative: trial division by Prime[1..SP-1]
 for (int i = 1; i < SP; i++){
  if (beg % Prime[i] == 0)
   return 1;
 }
#endif
 
#if 1
 for (int k = SP, dis = beg - last; k <= Prime[0]; ++k)
  ans[k] = (ans[k] + mprimes[k] * dis) % Prime[k];
#else
 // disabled MSVC inline-assembly version of the loop above
 // NOTE(review): ebx starts at SP but is then used as a byte offset stepped
 // by 4 - looks unfinished; confirm before enabling.
 __asm
 {
  mov ebx, SP
  mov ecx, beg
  sub ecx, last
  mov esi, ans
  mov edi, Prime[0]
  shl edi, 2
$LOOP:
  mov eax, [mprimes + ebx]
  imul ecx
  add eax, [esi + ebx]
  idiv [Prime  + ebx]
  mov [esi + ebx], edx
  add ebx, 4
  cmp ebx, edi
  jle $LOOP
 }
#endif
 return 0;
}
 
// Algorithm 1 worker: accumulates a count over the candidates in [beg, end).
//
// For every candidate that init() does not reject, a fresh bit mask of
// len = (DEFAULT_N - beg) / BLOCKSIZE positions is marked with stride
// Prime[k] for each k in SP..Prime[0], and the number of unmarked positions
// is added to the running total.
// NOTE(review): presumably position j stands for beg + j * BLOCKSIZE, i.e.
// each candidate is a residue class modulo BLOCKSIZE - TODO confirm.
int PI(int beg, int end)
{
 int ans[MAXP] = {0}; 
 int pnums = 0, last = 0;
 for (; beg < end; beg++){
  if (init (ans, beg, last))
   continue;
  unsigned char mask[MAXP * 2] = {0};
  // also records beg as the "last" candidate for the next init() call
  uint len = (DEFAULT_N - (last = beg)) / BLOCKSIZE;
  for (int k = SP; k <= Prime[0]; ++k){
   const uint p = Prime[k];
   // first position to mark for this prime, derived from its residue
   uint next = (ans[k] == 0) ? p - 1 : ans[k] - 1;
#ifndef ASM
   for (; next < len; next += p)
    mask[next >> 3] |= 1 << (next & MASK);
#else
   if (next < len)
    asmSetBits(mask, next, len, p);
#endif
  }
  int size = (DEFAULT_N / BLOCKSIZE + 7) >> 3;
  // marked bits of byte 0 are subtracted here, those of bytes 1.. below
  pnums -= bits[mask[0]];
#ifndef ASM
  for (int kk = 1; kk < size; ++kk)
   len -= bits[mask[kk]];
  pnums += len;
#else
  // NOTE(review): asmSum reads mask[size]..mask[1], one byte further than
  // the C loop above - confirm this is intended.
  pnums += len - asmSum(mask, size);
#endif
 }
 return pnums;
}
 
int main(int arg, char **argc)
{
 clock_t tstart = clock();
 
 int i, pnums;
 sieve(DEFAULT_N);
 //init bits
 for (i = 1; i < (int)(sizeof(bits) / sizeof(bits[0])); ++i)
  bits[i] = bits[i >> 1] + (i & 1);
 
 for (i = SP; i <= Prime[0]; ++i){
  solve(Prime[i], -BLOCKSIZE, 1, mprimes[0],  mprimes[i]);
  mprimes[i] = (mprimes[i] % (int)Prime[i] + Prime[i]) % Prime[i];
 }
#ifdef MULTI_THREAD
 if (arg > 1)
  pnums = multiThread(atoi(argc[1]), BLOCKSIZE);
 else
  pnums = multiThread(THREAD_NUM, BLOCKSIZE);
#else
 pnums = PI(0, BLOCKSIZE);
#endif
 printf("PI1[%u] : primes = %d, time use %ld ms\n", (uint)DEFAULT_N, (pnums + Prime[0]), clock() - tstart);
 return 0;
}
 
#else
 
#ifdef S6
 #define FACT 15015 * 16 // 3 * 5 * 7 * 11 * 13 = 15015
 #define SP 6
#else
 #define FACT 255255     // 3 * 5 * 7 * 11 * 13 * 17 = 255255
 #define SP 7
#endif
 
#define BLOCKSIZE (FACT << 2)
#define MAXM ((BLOCKSIZE >> COMP) + 2)
unsigned char BaseTpl[MAXM]; //the table the fist SP primes is removed
unsigned char bitsIndex[1 << 8][5] = {0};
 
/******************************
// 1  3  5  7  9 11 13 15
// 0  1  1  1  1  1  1  1  6    0xfe  mask[0] 
//17 19 21 23 25 27 29 31
// 1  1  0  1  0  0  1  1  5    0xca  mask[1]
//33 35 37 39 41 43 45 47
// 0  0  1  0  1  1  0  1  4    0xb4  mask[2]
//49 51 53 55 57 59 61 63 
// 0  0  1  0  0  1  1  0  3    0x64  mask[3]
******************************/
 
//#include <assert.h>
int outPrint(unsigned char mask[], int start, uint len)
{
 for (uint i = 0; i < len; i += 1 << COMP){
  int bi = i >> COMP;
#if 0
  assert(bits[mask[bi]] <= 5);
  for (uint j = i + 1; j < i + 16; j += 2){
   if ( !(mask[bi] & MASKN(j)) )
    printf("%d ", start + (bi << COMP) + 2 * ((j >> 1) & 7) + 1);
  }
#else
  if (bitsIndex[mask[bi]][0] == 0)
   continue;
  for (uint j = mask[bi], k = 0; bitsIndex[j][k]; k++)
   printf("%d ", start + (bi << COMP) + 2 * bitsIndex[j][k] - 1);
#endif
  putchar('\n');
 }
 return 0;
}
 
int piRange(int start, uint len = BLOCKSIZE)
{
 int srid = start % BLOCKSIZE;
 len += srid;
 
 uint next, pnums = 0;
 bool ok = start >= BLOCKSIZE;
 const int maxp = (int)sqrt((float)(start + len)) + 1;
 unsigned char mask[MAXM + 4];
 memcpy(mask, BaseTpl, (len >> COMP) + 1);
 mask[len >> COMP] |= ~(MASKN(len) - 1);
 if (srid)
  mask[srid >> COMP] |= (MASKN(srid) - 1);
 
 for (int i = SP + 1, beg = srid - 1, p = Prime[i]; p < maxp; p = Prime[++i]){
  if (ok){
   next = beg + p - (start - 1) % p;
   if ((next & 1) == 0)
    next += p;
  }else
   next = p * p;
  p <<= 1;
#ifdef ASM
  if (next < len)
   asmSetBits(mask , next, len , p);
#else
  for (; next < len; next += p)
   mask[next >> COMP] |= MASKN(next);
#endif
 }
 int size = len >> COMP;
 for (int k = (srid >> COMP); k <= size; k++)
  pnums += bits[mask[k]];
 return pnums;
}
 
 
int P7(int beg, int end)
{
 int pnums = 0;
#if 0
 for (int j = SP; beg <= Prime[j] && j > 0; j--){
  if (end >= Prime[j])
   pnums++;
 }
#else
//            2  3     5     7          11    13           17   19
 static int Prime19[20] = {0, 0, 1, 2, 2, 3, 3, 4, 4, 4, 4, 5, 5, 6, 6, 6, 6, 7, 7, 8};
 if (end > Prime[SP])
  pnums += Prime19[Prime[SP]] - Prime19[beg - 1];
 else
  pnums += Prime19[end] - Prime19[beg - 1];
#endif
 return pnums;
}
 
int PI(int beg, int end)
{
 int pnums = 0;
 if (beg > end || end > DEFAULT_N)
  return -1;
 if (beg < 2)
  beg = 2;
 if (beg <= Prime[SP])
  pnums = P7(beg, end);
 if (beg / BLOCKSIZE == end / BLOCKSIZE){
  pnums += piRange(beg, end - beg + 1);
  return pnums;
 }
 int size = end % BLOCKSIZE;
 if (size != 0)
  pnums += piRange(end - size, size + 1);
 size = beg % BLOCKSIZE;
 if (size != 0){
  pnums += piRange(beg, BLOCKSIZE - size);
  beg += BLOCKSIZE - size;
 }
 beg /= BLOCKSIZE, end /= BLOCKSIZE;
 for (int i = beg; i < end; i++){
  static int pcache[DEFAULT_N / BLOCKSIZE + 2] = {0};
  if (pcache[i] == 0)
   pcache[i] = piRange(i * BLOCKSIZE);
  pnums += pcache[i];
 }
 return pnums;
}
 
void initBits()
{
 int i;
 int bitsize = sizeof(bits) / sizeof(bits[0]);
 for (i = 1; i < bitsize; i++)
  bits[i] = bits[i >> 1] + (i & 1);
 for (i = 0; i < bitsize; i++)
  bits[i] = 8 - bits[i];
 
 for (i = 0; i < bitsize; i++){
  int num = i, cnt = 0;
  if (bits[num] > 5)
   continue;
  for (int j = 0; num; j++){
   if ( !(num & 1) )
    bitsIndex[i][cnt++] = j + 1;
   num >>= 1;
  }
//  printf("i = %d, cnt = %d\n", i, cnt);
 }
}
 
#include <windows.h>
 
int main(int arg, char **argc)
{
 clock_t tstart = clock();
 int pnums = 0;
 initBits();
 sieve(DEFAULT_N);
 
#ifdef MULTI_THREAD
    if (DEFAULT_N > THREAD_NUM * BLOCKSIZE){
  if (arg > 1)
   pnums = multiThread(atoi(argc[1]), DEFAULT_N);
  else
   pnums = multiThread(THREAD_NUM, DEFAULT_N);
 }else
  pnums = PI(0, DEFAULT_N);
#else
 pnums = PI(0, DEFAULT_N);
#endif
 printf("PI2[0 - %u] : primes = %d, time use %ld ms\n", (uint)DEFAULT_N, pnums, clock() - tstart);
 
 //for test the result
 int beg, end;
 while (scanf ("%d %d", &beg, &end) == 2 && beg <= end){
  tstart = clock();
  printf("PI[%d, %d] = ", beg, end);
  pnums = PI(beg, end);
  printf("%d, time use %d ms\n", pnums, clock() - tstart);
 
 }
 return 0;
}
#endif
 
int sieve(int maxn)
{
 uint p, primes = 1;
 uint maxp = (uint)sqrt((float)maxn) + 19;
 
#if (ALGORITHM == 1)
 if (maxp < BLOCKSIZE)
  maxp = BLOCKSIZE + 10;
#endif
 Prime[1] = 2;
 for (p = 3; p < maxp; p += 2){
  if ( !(BaseTpl[p >> COMP] & MASKN(p)) ){
   Prime[++primes] = p;
   for (uint j = p * p; j < maxp; j += p << 1)
    BaseTpl[j >> COMP] |= MASKN(j);
  }
 }
#if (COMP == 4)
 memset(BaseTpl, 0, sizeof(BaseTpl));
#else
 memset(BaseTpl, 0x55, sizeof(BaseTpl));
#endif
 for (int i = 2; i <= SP; i++){
//  printf("%d\n", Prime[i]);
  for (int j = Prime[i], p = j; j < BLOCKSIZE; j += p << 1)
   BaseTpl[j >> COMP] |= MASKN(j);//printf("j = %d\n", j);
 }
 Prime[primes + 1] = maxn;
 Prime[0] = primes;
 return primes;
}

源地址: http://www.mwtee.com/blog-18294-2560.html

你可能感兴趣的:(素数,素数筛选法)