Yes, You Can Do That in C!
But you might not like it.
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <stdbool.h>
#include <time.h>
#include <pthread.h>
#include <unistd.h> // For sysconf (on POSIX systems)
#define ARR_SIZE 1000
/*
 * Deliberately naive primality test, used as a CPU-bound workload.
 *
 * Returns true when n is prime and false otherwise. Values below 2 are never
 * prime. Candidate divisors are tried from 2 up to AND INCLUDING n / 2: the
 * inclusive bound matters for n == 4, whose only non-trivial divisor is
 * exactly n / 2 (an exclusive bound would report 4 as prime).
 */
bool is_prime_slow_c(int32_t n) {
    if (n < 2) {
        return false;
    }
    for (int32_t i = 2; i <= n / 2; i++) {
        if (n % i == 0) {
            return false;
        }
    }
    return true;
}
// Per-thread work description and result slot. main() fills in one of these
// for each worker thread and passes its address to pthread_create();
// thread_func() reads arr/start/end and writes partial_sum.
typedef struct {
int32_t *arr; // Pointer to the shared input array (thread_func only reads through it)
int start; // Start index into arr (inclusive)
int end; // End index into arr (exclusive)
long partial_sum; // Number of primes this thread found in arr[start..end)
} ThreadData;
/*
 * pthread start routine. `arg` is a ThreadData pointer; the routine counts
 * how many elements of arr[start..end) are prime according to
 * is_prime_slow_c() and stores that count in partial_sum.
 *
 * Always returns NULL; the result is delivered through the struct.
 */
void *thread_func(void *arg) {
    ThreadData *job = arg;
    const int32_t *values = job->arr;
    const int stop = job->end;
    long primes_found = 0;

    int idx = job->start;
    while (idx < stop) {
        primes_found += is_prime_slow_c(values[idx]) ? 1 : 0;
        idx++;
    }

    // Publish this worker's tally for the joining thread to pick up.
    job->partial_sum = primes_found;
    return NULL;
}
/*
 * Entry point. Fills an array of ARR_SIZE values from rand(), counts the
 * primes in it using one pthread per online CPU, and prints the count
 * together with the elapsed wall-clock time in whole seconds.
 *
 * Returns 0 on success, or EXIT_FAILURE if an allocation fails or a worker
 * thread cannot be created or joined. All heap memory is released on every
 * path.
 */
int main(void) {
    int exit_code = EXIT_FAILURE;
    int32_t *arr = NULL;
    ThreadData *thread_data = NULL;
    pthread_t *threads = NULL;
    int created = 0; // Number of threads actually started (and thus joinable)
    bool ok = true;
    long sum = 0;

    // Start timing
    time_t start_time = time(NULL);

    // Allocate memory for the array
    printf("Allocating memory for the array...\n");
    arr = malloc(ARR_SIZE * sizeof *arr);
    if (arr == NULL) {
        fprintf(stderr, "Failed to allocate the input array.\n");
        goto cleanup;
    }

    // Populate the array with random numbers
    printf("Populating the array with random numbers...\n");
    for (int32_t i = 0; i < ARR_SIZE; i++) {
        arr[i] = rand();
    }

    // -----------------------------------------------------------
    // Pthread-based parallel prime counting
    // -----------------------------------------------------------

    // 1. Determine the number of CPUs
    int num_cpus = (int)sysconf(_SC_NPROCESSORS_ONLN);
    if (num_cpus < 1) {
        fprintf(stderr, "Could not determine number of CPUs; defaulting to 1.\n");
        num_cpus = 1;
    }
    // More workers than elements would only produce empty chunks.
    if (num_cpus > ARR_SIZE) {
        num_cpus = ARR_SIZE;
    }

    // 2. Create arrays to hold thread data and pthread handles
    thread_data = calloc((size_t)num_cpus, sizeof *thread_data);
    threads = calloc((size_t)num_cpus, sizeof *threads);
    if (thread_data == NULL || threads == NULL) {
        fprintf(stderr, "Failed to allocate per-thread bookkeeping.\n");
        goto cleanup;
    }

    // Calculate how many elements per thread
    int chunk_size = ARR_SIZE / num_cpus;

    // 3. Initialize per-thread data and create threads
    for (int i = 0; i < num_cpus; i++) {
        thread_data[i].arr = arr;
        thread_data[i].start = i * chunk_size;
        // Last chunk takes the remainder if ARR_SIZE is not perfectly divisible
        thread_data[i].end = (i == num_cpus - 1) ? ARR_SIZE : (i + 1) * chunk_size;
        thread_data[i].partial_sum = 0;

        int rc = pthread_create(&threads[i], NULL, thread_func, &thread_data[i]);
        if (rc != 0) {
            // Stop launching; threads already started are still joined below.
            fprintf(stderr, "pthread_create failed for thread %d (error %d).\n", i, rc);
            ok = false;
            break;
        }
        created++;
    }

    // 4. Join the threads that were started and accumulate partial sums
    for (int i = 0; i < created; i++) {
        int rc = pthread_join(threads[i], NULL);
        if (rc != 0) {
            fprintf(stderr, "pthread_join failed for thread %d (error %d).\n", i, rc);
            ok = false;
            continue;
        }
        sum += thread_data[i].partial_sum;
    }

    if (ok) {
        // End timing
        time_t end_time = time(NULL);
        printf("Count (from C, Parallel): %ld. Seconds: %ld\n", sum, (long)(end_time - start_time));
        exit_code = 0;
    }

cleanup:
    // 5. Clean up (free(NULL) is a no-op, so partial setups are fine)
    free(thread_data);
    free(threads);
    free(arr);
    return exit_code;
}
On my office system, this completes in 6–7 seconds—slightly slower than Rayon. And what a lot of work that was!