题目有点大, 主要是为了便于检索。小素数筛选通常采用 Eratosthenes 筛法, 时间复杂度
为 O(n lg lg n); 不同实现的性能往往差别很大, 网上不少人写的筛选法大多比较初级。
本文采用分段筛选、多线程、汇编、位压缩等优化手段, 在 2.0 GHz 双核 CPU 上约 2 秒即可算出 2^31
以内的素数个数。目前还没有实现素数的存储, 下个版本再实现。
/******************************************************************
Copyright (C) 2007 Huang Yuanbing
version 1.3, 2007 PrimeNumber
mailto: bailuzhou.dot.163.com
free use for non-commercial purposes
******************************************************************
description:
input two numbers beg, end (0 < beg <= end < 2^31)
calculate the number of primes in the interval [beg, end]
for example:
PI[1, 2] = 1, only 2 is prime in this subinterval
PI[1, 2^31 - 1] = 105097565, there are 105097565 primes in this interval
basic idea:
there are two algorithms in this file. as for the first
algorithm (ALGORITHM = 1), I do not want to describe the idea here;
even though this algorithm is about (30 - 50)% faster than the second one for
larger input intervals, it has very bad performance in many cases
and also cannot save each prime if you want them.
as for the second one (ALGORITHM = 2):
this is an improved segmented sieve of Eratosthenes,
using multiple threads, asm and bit compression to optimize the classical algorithm.
to save memory and for performance, primes are not saved, but it's easy to do that.
for a given subinterval [beg, end] and pn running threads, the ith thread
calculates the subinterval [beg + (end - beg) / pn * (i - 1), beg + (end - beg) / pn * i)
(pay attention to the last thread), and the main thread adds up the results it gets from
the other threads.
the time complexity of function PI(0, n) is O(n * ln(ln n)) and the space complexity is O(sqrt(n)).
this algorithm can also be implemented with MPI, OpenMP and multiple processes (IPC)
(only some parts are finished).
this source file can be compiled by gcc and vc++
and runs on windows, unix and linux (without the ASM macro).
for some older cpus the ASM macro can lead to bad performance.
there may be some bugs, for it has not been fully tested.
at present the fastest prime algorithm that I know of is ecprime; you can get it from the website:
http://www.primzahlen.de/files/referent/kw/ecprime.source.zip
******************************************************************
******************************************************************/
# include <stdio.h>
# include <stdlib.h>
# include <time.h>
# include <math.h>
# include <memory.h>
# include <assert.h>
/* ---- build configuration ------------------------------------------------ */
#define ALGORITHM 2     // which algorithm to build: 1 or 2 (see the file header)
#define MULTI_THREAD    // multi-core CPU optimization: split the work over threads
#define ASM0            // assembly optimization is keyed on the name ASM, which is not defined here
#define COMP 4          // a sieve byte is addressed as (n >> COMP)
#define MASK 7          // selects one of the 8 bits inside a sieve byte
#define MAXP 6600       // capacity of the Prime[] table
/*
 * MASKN(n): the bit inside byte (n >> COMP) that stands for the number n.
 * With COMP == 3 every number has its own bit; otherwise n and n ^ 1 share a
 * bit, i.e. one byte covers 16 consecutive numbers with one bit per odd number.
 * The argument is parenthesized so that expressions such as MASKN(a | b)
 * expand as intended.
 */
#if (COMP == 3)
#define MASKN(n) (1 << ((n) & MASK))
#else
#define MASKN(n) (1 << (((n) >> 1) & MASK))
#endif
#define THREAD_NUM 4                    // default number of running threads
#define DEFAULT_N ((1u << 31) - 1)      // the default input range is [0, DEFAULT_N]

typedef unsigned int uint;

unsigned char bits[1 << 8];     // per-byte bit-count table, filled by main()/initBits()
int Prime[MAXP];                // filled by sieve(): Prime[0] = count, Prime[1..count] = primes,
                                // Prime[count + 1] = sentinel (the sieve limit)

int sieve(int);
int PI(int, int);
#ifdef MULTI_THREAD
/* Work item handed to one worker thread, plus the slot for its result. */
struct ThreadInfo
{
    int beg;    /* first number of the range passed to PI() */
    int end;    /* last number of the range passed to PI() */
    int pnums;  /* value returned by PI(beg, end), stored by the worker */
};

/* One work item per worker thread. */
struct ThreadInfo Threadparam[THREAD_NUM * 2 + 2];
#ifdef _WIN32
# include <windows.h>
DWORD WINAPI Win32ThreadFun(LPVOID pinfo)
#else
# include <pthread.h>
void* POSIXThreadFun(void *pinfo)
#endif
{
ThreadInfo *pThreadInfo = (ThreadInfo *) (pinfo);
pThreadInfo->pnums = PI(pThreadInfo->beg, pThreadInfo->end);
printf("PI[%10d, %10d] = %d\n", pThreadInfo->beg, pThreadInfo->end, pThreadInfo->pnums);
return 0;
}
int init_pm(int tpnums, uint maxn, int blocksize)
{
Threadparam[0].end = (maxn / tpnums) - (maxn / tpnums) % blocksize
+ (tpnums - 1) * blocksize * tpnums / 2;
int bsize = Threadparam[0].end;
for (int i = 1; i < tpnums; i++){
Threadparam[i].beg = Threadparam[i - 1].end;
Threadparam[i].end = Threadparam[i].beg + (bsize -= tpnums * blocksize);
}
Threadparam[0].beg = 2;
Threadparam[tpnums - 1].end = maxn;
return 0;
}
int multiThread(int theadnums, uint maxn)
{
int i, pnums = 0;
#if ALGORITHM == 1
for (i = 1; i < theadnums; i++)
Threadparam[i].beg = Threadparam[i - 1].end = (maxn / theadnums) * i;
Threadparam[0].beg = 2;
Threadparam[theadnums - 1].end = maxn;
#else
init_pm(theadnums, maxn, 255255 << 2);
#endif
#ifdef _WIN32
HANDLE tHand[THREAD_NUM * 2];
DWORD threadID[THREAD_NUM * 2];
for (i = 0; i < theadnums; i++){
tHand[i] = CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE)Win32ThreadFun, (LPVOID)(&Threadparam[i]), 0, &threadID[i]);
if (tHand[i] == NULL)
printf("create Win32 thread error\n");
}
WaitForMultipleObjects(theadnums, tHand, true, INFINITE);
for (i = 0; i < theadnums; i++){
pnums += Threadparam[i].pnums;
CloseHandle(tHand[i]);
}
#else
pthread_t tid[THREAD_NUM * 2];
for (i = 0; i < theadnums; i++){
int error = pthread_create(&tid[i], NULL, POSIXThreadFun, &Threadparam[i]);
if ( error != 0 )
printf("Create pthread error: %d\n", error);
}
for (i = 0; i < theadnums; i++){
pthread_join(tid[i], NULL);
pnums += Threadparam[i].pnums;
}
#endif
return pnums;
}
#endif
#ifdef ASM
/*
 * asmSetBits: inline-assembly (32-bit x86 register names) version of the
 * sieve marking loop.
 *
 * Starting at index `next`, one bit of `mask` is set for next, next + p,
 * next + 2p, ... and the loop repeats while the index is below `len`
 * (signed compare).  The bound is tested after the body, so at least one bit
 * is always set; the callers in this file only call it when next < len.
 *
 * How an index is turned into a byte/bit depends on the configuration:
 *   ALGORITHM != 1 and COMP == 4 : byte = index >> COMP, bit = (index >> 1) & 7
 *   ALGORITHM == 1               : byte = index >> 3,    bit = index & 7
 *   otherwise                    : BTS with the index as a bit offset from mask
 *
 * The _CONSOLE branch uses Intel-syntax __asm { } blocks, the other branch
 * AT&T-syntax asm strings whose operands are
 * %0 = p (output), %1 = mask, %2 = next, %3 = len, %4 = p.
 *
 * NOTE(review): in the asm-string branch %esi is written but is missing from
 * the clobber list, and the "=m"(p) output is never written by any
 * instruction - confirm the constraints before enabling ASM.
 */
void asmSetBits(unsigned char mask[], int next, int len, int p)
{
#ifdef _CONSOLE
__asm
{
#if (COMP == 4 || ALGORITHM == 1)
// mov esi, len
mov edx, p
mov esi, mask
mov eax, next
$loop1:
// edi becomes the byte index, ecx the bit number of the current index (eax)
mov edi, eax
mov ecx, eax
#if (ALGORITHM != 1 && COMP == 4)
shr edi, COMP
shr ecx, 1
#else
shr edi, 3
#endif
mov ebx, 1
and ecx, MASK
shl ebx, cl
or byte ptr [esi + edi], bl
// advance by p and repeat while the index is still below len
add eax, edx
cmp eax, len
jl $loop1
#else
mov ebx, [mask]
mov esi, len
mov eax, next
mov edi, p
$loop2:
// set bit number eax, counted from the address held in ebx
bts [ebx], eax
add eax, edi
cmp eax, esi
jl $loop2
#endif
}
#else
__asm
(
#if (COMP == 4 || ALGORITHM == 1)
"movl %1, %%esi\n"
"movl %4, %%edx\n"
"movl %2, %%eax\n"
"Loop1:\n"
/* edi becomes the byte index, cl the bit number of the current index (eax) */
"movl %%eax, %%edi\n"
"movl %%eax, %%ecx\n"
#if (ALGORITHM != 1 && COMP == 4)
"shrl $4, %%edi\n"
"shrb $1, %%cl\n"
#else
"shrl $3, %%edi\n"
#endif
"movb $1, %%bl\n"
"andb $7, %%cl\n"
"shlb %%cl, %%bl\n"
"orb %%bl, (%%esi, %%edi)\n"
/* advance by p and repeat while the index is still below len */
"addl %%edx, %%eax\n"
"cmpl %3, %%eax\n"
"jl Loop1\n"
#else
"leal (%1), %%eax\n"
"movl %3, %%esi\n"
"movl %2, %%ebx\n"
"movl %4, %%edi\n"
"loop2:\n"
"btsl %%ebx, (%%eax)\n"
"addl %%edi, %%ebx\n"
"cmpl %%esi, %%ebx\n"
"jl loop2\n"
#endif
: "=m" (p)
: "r" (mask), "g" (next), "g" (len),"g" (p)
: "ax", "memory", "%eax", "%ebx", "%ecx", "%edx", "%edi"
);
#endif
// return;
}
/*
 * asmSum: inline-assembly (32-bit x86 register names) table-lookup sum.
 *
 * Returns bits[mask[len]] + bits[mask[len - 1]] + ... + bits[mask[1]]:
 * the x86 LOOP instruction counts ecx down from len and stops when it
 * reaches 0, so mask[0] is not part of the sum while mask[len] is.
 *
 * The _CONSOLE branch uses Intel-syntax __asm { } blocks, the other branch
 * AT&T-syntax asm strings whose operands are
 * %0 = pnums (output), %1 = len, %2 = mask, %3 = bits.
 *
 * NOTE(review): with len == 0 the loop body still runs once and ecx then
 * wraps around - presumably callers always pass len > 0; confirm.
 * NOTE(review): in the asm-string branch %esi is written but is missing from
 * the clobber list.
 */
int asmSum(const unsigned char mask[], int len)
{
int pnums = 0;
#ifdef _CONSOLE
__asm
{
mov esi, mask
mov ecx, len
xor edx, edx
xor eax, eax
xor ebx, ebx
$loopsum:
// al = mask[ecx], bl = bits[al], edx accumulates the total
mov al, byte ptr [esi + ecx]
mov bl, byte ptr bits[eax]
add edx, ebx
loop $loopsum
mov pnums, edx
}
#else
__asm
(
"movl %1, %%ecx\n"
"xorl %%edx, %%edx\n"
"movl %2, %%edi\n"
"movl %3, %%esi\n"
"xorl %%eax, %%eax\n"
"xorl %%ebx, %%ebx\n"
"loopsum:\n"
/* al = mask[ecx], bl = bits[al], edx accumulates the total */
"movb (%%edi, %%ecx), %%al\n"
"movb (%%esi, %%eax), %%bl\n"
"addl %%ebx, %%edx\n"
"loop loopsum\n"
"movl %%edx, %0\n"
: "=m" (pnums)
: "g" (len), "g" (mask), "g"(bits)
: "ax", "memory", "%eax", "%ebx", "%ecx", "%edx", "%edi"
);
#endif
return pnums;
}
#endif
/************************************************************************/
/* ALGORITHM1 */
/************************************************************************/
#if (ALGORITHM == 1)
// Constants and tables used only when ALGORITHM == 1.
// SP: index into Prime[]; sieve() marks the multiples of Prime[2..SP] in
// BaseTpl, and the per-prime loops of init()/PI() start at Prime[SP].
#define SP 7
// 30030 = 2 * 3 * 5 * 7 * 11 * 13
#define BLOCKSIZE 30030
// sieve bitmap indexed as BaseTpl[n >> COMP] & MASKN(n); filled by sieve()
unsigned char BaseTpl[(1 << 16) >> COMP];
// mprimes[k]: value computed in main() from solve(Prime[k], -BLOCKSIZE, 1, ...)
// and reduced into [0, Prime[k]); mprimes[0] is only used as a scratch output there
int mprimes[MAXP];
/*
 * solve: finds integers x and y with a * x + b * y == c by the recursive
 * extended Euclidean scheme.
 *
 * When a is 0 the answer is x = 0, y = c / b (integer division).  Otherwise
 * the smaller problem (b % a, a) is solved with the two outputs swapped and
 * x is corrected by the quotient b / a.
 */
void solve(int a, int b, int c, int &x, int &y)
{
    if (a == 0){
        x = 0;
        y = c / b;
        return;
    }

    const int quotient = b / a;
    solve(b % a, a, c, y, x);
    x -= quotient * y;
}
/*
 * init (ALGORITHM 1): decides whether the value `beg` has to be processed
 * and, if so, advances the per-prime state in ans[].
 *
 * Returns 1 (skip) when beg is even or is marked in BaseTpl; sieve() marks
 * the multiples of Prime[2..SP] there.  Otherwise, for every k in
 * SP..Prime[0], ans[k] becomes (ans[k] + mprimes[k] * (beg - last)) % Prime[k]
 * and 0 is returned.
 *
 *   ans  - per-prime state, updated in place
 *   beg  - current value
 *   last - value for which ans[] was last updated
 *
 * The "#else" branches are disabled alternatives (trial division by
 * Prime[1..SP-1], and an inline-assembly form of the update loop).
 */
int init(int ans[], int beg, int last)
{
#if 1
if ((beg & 1) == 0 || BaseTpl[beg >> COMP] & MASKN(beg))
return 1;
#else
for (int i = 1; i < SP; i++){
if (beg % Prime[i] == 0)
return 1;
}
#endif
#if 1
// dis is how far beg has moved since ans[] was last updated
for (int k = SP, dis = beg - last; k <= Prime[0]; ++k)
ans[k] = (ans[k] + mprimes[k] * dis) % Prime[k];
#else
__asm
{
mov ebx, SP
mov ecx, beg
sub ecx, last
mov esi, ans
mov edi, Prime[0]
shl edi, 2
$LOOP:
mov eax, [mprimes + ebx]
imul ecx
add eax, [esi + ebx]
idiv [Prime + ebx]
mov [esi + ebx], edx
add ebx, 4
cmp ebx, edi
jle $LOOP
}
#endif
return 0;
}
/*
 * PI (ALGORITHM 1): walks every value `beg` in [beg, end) and, for each one
 * that init() does not reject, sieves a bitmap of len = (DEFAULT_N - beg) /
 * BLOCKSIZE positions and adds the number of unmarked positions to the
 * result.
 *
 * Presumably position t stands for the number beg + BLOCKSIZE * (t + 1) or
 * similar, with ans[k] tracking where Prime[k] first divides that
 * progression - the exact mapping is not documented in this file; confirm
 * before relying on it.
 */
int PI(int beg, int end)
{
int ans[MAXP] = {0};
int pnums = 0, last = 0;
for (; beg < end; beg++){
// init() returns non-zero for values that are skipped; otherwise it updates ans[]
if (init (ans, beg, last))
continue;
unsigned char mask[MAXP * 2] = {0};
// also remembers beg in `last` for the next init() call
uint len = (DEFAULT_N - (last = beg)) / BLOCKSIZE;
for (int k = SP; k <= Prime[0]; ++k){
const uint p = Prime[k];
// first position to mark for this prime, then every p-th position after it
uint next = (ans[k] == 0) ? p - 1 : ans[k] - 1;
#ifndef ASM
for (; next < len; next += p)
mask[next >> 3] |= 1 << (next & MASK);
#else
if (next < len)
asmSetBits(mask, next, len, p);
#endif
}
// number of mask bytes that cover DEFAULT_N / BLOCKSIZE positions
int size = (DEFAULT_N / BLOCKSIZE + 7) >> 3;
// unmarked positions = len - (marked bits); bits[] is filled in main()
pnums -= bits[mask[0]];
#ifndef ASM
for (int kk = 1; kk < size; ++kk)
len -= bits[mask[kk]];
pnums += len;
#else
// NOTE(review): asmSum() adds mask[size] .. mask[1], while the C loop above
// covers mask[1] .. mask[size - 1] - confirm the two variants are meant to agree.
pnums += len - asmSum(mask, size);
#endif
}
return pnums;
}
/*
 * main (ALGORITHM 1): builds the prime table, the bit-count table and the
 * mprimes[] table, counts with PI()/multiThread() over [0, BLOCKSIZE) and
 * prints the total together with the elapsed time.
 *
 * An optional first command-line argument is the number of threads.
 * The elapsed time comes from clock() and is converted to milliseconds with
 * CLOCKS_PER_SEC (a raw tick count is only milliseconds where
 * CLOCKS_PER_SEC happens to be 1000).
 */
int main(int arg, char **argc)
{
    clock_t tstart = clock();
    int i, pnums;

    sieve(DEFAULT_N);

    /* bits[i] = number of set bits in the byte value i */
    for (i = 1; i < (int)(sizeof(bits) / sizeof(bits[0])); ++i)
        bits[i] = bits[i >> 1] + (i & 1);

    /* mprimes[i]: solution of solve() for Prime[i], reduced into [0, Prime[i]) */
    for (i = SP; i <= Prime[0]; ++i){
        solve(Prime[i], -BLOCKSIZE, 1, mprimes[0], mprimes[i]);
        mprimes[i] = (mprimes[i] % (int)Prime[i] + Prime[i]) % Prime[i];
    }

#ifdef MULTI_THREAD
    if (arg > 1)
        pnums = multiThread(atoi(argc[1]), BLOCKSIZE);
    else
        pnums = multiThread(THREAD_NUM, BLOCKSIZE);
#else
    pnums = PI(0, BLOCKSIZE);
#endif

    long elapsedms = (long)((double)(clock() - tstart) * 1000.0 / CLOCKS_PER_SEC);
    printf("PI1[%u] : primes = %d, time use %ld ms\n", (uint)DEFAULT_N, (pnums + Prime[0]), elapsedms);
    return 0;
}
#else
/*
 * Constants and tables used when ALGORITHM != 1.
 * FACT is the product of the odd primes whose multiples are pre-marked in
 * the template BaseTpl; SP is the index in Prime[] of the largest of them.
 */
#ifdef S6
#define FACT (15015 * 16) // 3 * 5 * 7 * 11 * 13 = 15015
#define SP 6
#else
#define FACT 255255 // 3 * 5 * 7 * 11 * 13 * 17 = 255255
#define SP 7
#endif
#define BLOCKSIZE (FACT << 2)               // amount of numbers sieved per segment
#define MAXM ((BLOCKSIZE >> COMP) + 2)      // bytes needed for one segment bitmap
unsigned char BaseTpl[MAXM]; // template bitmap with the multiples of the first SP primes already marked
/*
 * bitsIndex[b]: 1-based positions of the clear bits of byte value b, filled
 * by initBits() for bytes with at most 5 clear bits.  A row has one slot
 * more than the 5 positions it can hold, so it always ends with a 0.
 */
unsigned char bitsIndex[1 << 8][6] = {0};
/******************************
// 1 3 5 7 9 11 13 15
// 0 1 1 1 1 1 1 1 6 0xfe mask[0]
//17 19 21 23 25 27 29 31
// 1 1 0 1 0 0 1 1 5 0xca mask[1]
//33 35 37 39 41 43 45 47
// 0 0 1 0 1 1 0 1 4 0xb4 mask[2]
//49 51 53 55 57 59 61 63
// 0 0 1 0 0 1 1 0 3 0x64 mask[3]
******************************/
//#include <assert.h>
int outPrint(unsigned char mask[], int start, uint len)
{
for (uint i = 0; i < len; i += 1 << COMP){
int bi = i >> COMP;
#if 0
assert(bits[mask[bi]] <= 5);
for (uint j = i + 1; j < i + 16; j += 2){
if ( !(mask[bi] & MASKN(j)) )
printf("%d ", start + (bi << COMP) + 2 * ((j >> 1) & 7) + 1);
}
#else
if (bitsIndex[mask[bi]][0] == 0)
continue;
for (uint j = mask[bi], k = 0; bitsIndex[j][k]; k++)
printf("%d ", start + (bi << COMP) + 2 * bitsIndex[j][k] - 1);
#endif
putchar('\n');
}
return 0;
}
/*
 * piRange: counts the unmarked odd numbers in [start, start + len) after
 * sieving one segment; the whole range must lie inside a single
 * BLOCKSIZE-aligned block.
 *
 *   start - first number of the range
 *   len   - amount of numbers in the range (defaults to a whole block)
 *
 * The segment bitmap starts as a copy of the template BaseTpl, then the odd
 * multiples of every prime from Prime[SP + 1] up to sqrt(range end) are
 * marked.  The prime 2 has no bit at all, and in the first block the primes
 * Prime[2..SP] are marked by the template itself, so none of those are
 * counted here (PI() adds them through P7()).
 *
 * The sieve bound is computed in double precision: a float cannot represent
 * values above 2^24 exactly, and rounding the range end down could drop the
 * largest required prime (e.g. 46337 for a range ending at 46337 * 46337).
 */
int piRange(int start, uint len = BLOCKSIZE)
{
    int srid = start % BLOCKSIZE;       /* offset of start inside its block */
    len += srid;                        /* from here on len is the end offset inside the block */
    uint next, pnums = 0;
    bool ok = start >= BLOCKSIZE;       /* false only for the first block */
    const int maxp = (int)sqrt((double)(start + len)) + 1;
    unsigned char mask[MAXM + 4];

    memcpy(mask, BaseTpl, (len >> COMP) + 1);
    /* hide everything at or beyond the end offset in the last byte ... */
    mask[len >> COMP] |= ~(MASKN(len) - 1);
    /* ... and everything before the start offset in the first byte */
    if (srid)
        mask[srid >> COMP] |= (MASKN(srid) - 1);

    for (int i = SP + 1, beg = srid - 1, p = Prime[i]; p < maxp; p = Prime[++i]){
        if (ok){
            /* offset of the first odd multiple of p that is >= start */
            next = beg + p - (start - 1) % p;
            if ((next & 1) == 0)
                next += p;
        }else
            next = p * p;               /* first block: offsets are the numbers themselves */
        p <<= 1;                        /* step over the even multiples */
#ifdef ASM
        if (next < len)
            asmSetBits(mask , next, len , p);
#else
        for (; next < len; next += p)
            mask[next >> COMP] |= MASKN(next);
#endif
    }

    /* bits[] holds the number of clear bits per byte value (see initBits) */
    int size = len >> COMP;
    for (int k = (srid >> COMP); k <= size; k++)
        pnums += bits[mask[k]];
    return pnums;
}
/*
 * P7: number of primes p with beg <= p <= min(end, Prime[SP]).
 *
 * Works from a small cumulative table, so beg must be in [1, 20] and
 * min(end, Prime[SP]) must be a valid index of that table.
 */
int P7(int beg, int end)
{
    /* Prime19[n] = number of primes <= n, for n = 0..19 (2 3 5 7 11 13 17 19) */
    static int Prime19[20] = {0, 0, 1, 2, 2, 3, 3, 4, 4, 4, 4, 5, 5, 6, 6, 6, 6, 7, 7, 8};

    const int top = (end > Prime[SP]) ? Prime[SP] : end;
    return Prime19[top] - Prime19[beg - 1];
}
/*
 * PI (ALGORITHM 2): number of primes in [beg, end].
 *
 * Returns -1 when beg > end or end > DEFAULT_N.  A beg below 2 is raised
 * to 2.  The primes up to Prime[SP] come from P7(); everything else is
 * counted with piRange(): directly when the range lies inside one block,
 * otherwise as the partial last block, the partial first block and the whole
 * blocks in between.  Counts of whole blocks are remembered in a static
 * cache, where 0 means "not computed yet".
 */
int PI(int beg, int end)
{
    static int pcache[DEFAULT_N / BLOCKSIZE + 2] = {0};

    if (beg > end || end > DEFAULT_N)
        return -1;
    if (beg < 2)
        beg = 2;

    int total = 0;
    if (beg <= Prime[SP])
        total = P7(beg, end);

    if (beg / BLOCKSIZE == end / BLOCKSIZE)
        return total + piRange(beg, end - beg + 1);

    /* partial block at the end of the range */
    const int tail = end % BLOCKSIZE;
    if (tail != 0)
        total += piRange(end - tail, tail + 1);

    /* partial block at the start of the range */
    const int head = beg % BLOCKSIZE;
    if (head != 0){
        total += piRange(beg, BLOCKSIZE - head);
        beg += BLOCKSIZE - head;
    }

    /* whole blocks in between */
    const int firstblock = beg / BLOCKSIZE;
    const int lastblock = end / BLOCKSIZE;
    for (int blk = firstblock; blk < lastblock; blk++){
        if (pcache[blk] == 0)
            pcache[blk] = piRange(blk * BLOCKSIZE);
        total += pcache[blk];
    }
    return total;
}
/*
 * initBits: fills the two byte-indexed lookup tables.
 *
 *   bits[b]      - number of clear bits in the byte value b
 *   bitsIndex[b] - 1-based positions of the clear bits of b, lowest first,
 *                  for every b with at most 5 clear bits
 *
 * All 8 bit positions are examined; stopping as soon as the remaining value
 * is 0 would miss the clear bits above the highest set bit (0x07 would get
 * no entries although bits 3..7 are clear).
 */
void initBits()
{
    int i;
    const int bitsize = sizeof(bits) / sizeof(bits[0]);

    /* set-bit count first ... */
    bits[0] = 0;
    for (i = 1; i < bitsize; i++)
        bits[i] = bits[i >> 1] + (i & 1);
    /* ... then turn it into the clear-bit count */
    for (i = 0; i < bitsize; i++)
        bits[i] = 8 - bits[i];

    for (i = 0; i < bitsize; i++){
        int cnt = 0;
        if (bits[i] > 5)
            continue;
        /* cnt ends up equal to bits[i], so at most 5 slots are written */
        for (int j = 0; j < 8; j++){
            if ( !((i >> j) & 1) )
                bitsIndex[i][cnt++] = j + 1;
        }
    }
}
#include <windows.h>
int main(int arg, char **argc)
{
clock_t tstart = clock();
int pnums = 0;
initBits();
sieve(DEFAULT_N);
#ifdef MULTI_THREAD
if (DEFAULT_N > THREAD_NUM * BLOCKSIZE){
if (arg > 1)
pnums = multiThread(atoi(argc[1]), DEFAULT_N);
else
pnums = multiThread(THREAD_NUM, DEFAULT_N);
}else
pnums = PI(0, DEFAULT_N);
#else
pnums = PI(0, DEFAULT_N);
#endif
printf("PI2[0 - %u] : primes = %d, time use %ld ms\n", (uint)DEFAULT_N, pnums, clock() - tstart);
//for test the result
int beg, end;
while (scanf ("%d %d", &beg, &end) == 2 && beg <= end){
tstart = clock();
printf("PI[%d, %d] = ", beg, end);
pnums = PI(beg, end);
printf("%d, time use %d ms\n", pnums, clock() - tstart);
}
return 0;
}
#endif
int sieve(int maxn)
{
uint p, primes = 1;
uint maxp = (uint)sqrt((float)maxn) + 19;
#if (ALGORITHM == 1)
if (maxp < BLOCKSIZE)
maxp = BLOCKSIZE + 10;
#endif
Prime[1] = 2;
for (p = 3; p < maxp; p += 2){
if ( !(BaseTpl[p >> COMP] & MASKN(p)) ){
Prime[++primes] = p;
for (uint j = p * p; j < maxp; j += p << 1)
BaseTpl[j >> COMP] |= MASKN(j);
}
}
#if (COMP == 4)
memset(BaseTpl, 0, sizeof(BaseTpl));
#else
memset(BaseTpl, 0x55, sizeof(BaseTpl));
#endif
for (int i = 2; i <= SP; i++){
// printf("%d\n", Prime[i]);
for (int j = Prime[i], p = j; j < BLOCKSIZE; j += p << 1)
BaseTpl[j >> COMP] |= MASKN(j);//printf("j = %d\n", j);
}
Prime[primes + 1] = maxn;
Prime[0] = primes;
return primes;
}