Saturday, August 23, 2025

ARMv10α-Neuromorphic-VHDLv2-Adi-Protocol_Internet_4.0+BIOS_ADL_test.c

 

 

## Thank you, ARM, for your notice of secondant precession to neuromorphic computing; your application suite awaits your engineering and professional security services for its development.

# Note: this system works well with a hypercapacitor (a superconductor and superinductor with optoelectronic gas and a varied diffraction-grating assembly). With 8 terminals on both ends (4 anode, 4 cathode) and 2 sumps, it can bridge the power system as a variable signal-transform klystron and a simple data-form transducer, enabling instant large-scale data manipulation.

-- VHDL Architecture Map v2 for the Neuromorphic System

-- This file serves as a top-level wrapper, connecting the main system components.

-- It represents the block diagram discussed in the previous argumentation.

-- This version includes a 'sump' bypass bridge for the reset signal.


library ieee;

use ieee.std_logic_1164.all;

use ieee.numeric_std.all;


-- May need a larger math numeric_std


-- A conceptual package for memory-mapped interface signals.

package arm_interface_types is

    -- Record bundling the signals of the conceptual memory-mapped ARM bus.
    -- Address and both data fields are 256 bits wide; the two enables are
    -- single-bit strobes.
    type arm_bus_master is record
        addr_bus   : std_logic_vector(255 downto 0);  -- address
        write_data : std_logic_vector(255 downto 0);  -- data for a write
        read_data  : std_logic_vector(255 downto 0);  -- data for a read
        write_en   : std_logic;                       -- write strobe
        read_en    : std_logic;                       -- read strobe
    end record;

end package arm_interface_types;


-- A context clause only applies to the design unit that follows it, so the
-- IEEE library must be named again here; without it std_logic and
-- std_logic_vector are not visible inside this package.
library ieee;
use ieee.std_logic_1164.all;

use work.arm_interface_types.all;

-- A conceptual package for optical-related signals.
package optical_types is

    -- Record describing one wide, high-speed optical channel:
    --   data  : 128-bit payload
    --   valid : single-bit flag accompanying the payload
    --   ready : single-bit flag carried in the same record
    -- NOTE(review): 'ready' normally travels against the data direction in a
    -- valid/ready handshake, but as a record field it inherits the port mode
    -- of the whole record -- confirm how it is meant to be driven.
    type optical_channel is record
        data  : std_logic_vector(127 downto 0);
        valid : std_logic;
        ready : std_logic;
    end record;

end package optical_types;


-- Each design unit needs its own context clause: the entity uses std_logic
-- (ieee.std_logic_1164), arm_bus_master (arm_interface_types) and
-- optical_channel (optical_types), so all three are made visible here.
-- The architecture of this entity inherits this context clause.
library ieee;
use ieee.std_logic_1164.all;

use work.arm_interface_types.all;
use work.optical_types.all;

-- Top-level entity representing the system-level map.
-- Exposes the external ports for clock, reset, the ARM host bus and the
-- Optical Network-on-Chip (ONoC) channels.
entity NeuromorphicSystem_Map is
    port (
        clk                 : in    std_logic;
        reset               : in    std_logic;

        -- ARM host interface (memory-mapped), used for control.
        arm_bus             : inout arm_bus_master;

        -- ONoC interfaces for high-speed data.
        optical_in_channel  : in    optical_channel;
        optical_out_channel : out   optical_channel
    );
end entity NeuromorphicSystem_Map;


-- Structural architecture: wires the ARM interface controller, the ONoC
-- interface and the neuromorphic core together and builds the 'sump'
-- bypass reset that is applied to the ONoC interface and the core.
architecture Structural of NeuromorphicSystem_Map is

    -- Width of the electrical data path between the ONoC interface and the
    -- core. It equals the 128-bit 'data' field of optical_channel. Every
    -- bus below is sized from this constant so the two components and the
    -- connecting signals cannot drift apart again (the signals and the core
    -- ports were previously 256 bits while the ONoC ports were 128 bits).
    -- NOTE(review): if the separately-defined Neuromorphic_Core entity really
    -- has 256-bit ports, the width decision must be revisited there.
    constant C_ONOC_DATA_WIDTH : natural := 128;

    -- Control/status link between the ARM controller and the core.
    signal arm_to_core_control_bus : std_logic_vector(63 downto 0);
    signal core_to_arm_status_bus  : std_logic_vector(63 downto 0);

    -- ONoC -> core stream (data/valid driven by the ONoC interface,
    -- ready driven by the core).
    signal onoc_to_core_data_bus   : std_logic_vector(C_ONOC_DATA_WIDTH - 1 downto 0);
    signal onoc_to_core_valid      : std_logic;
    signal onoc_to_core_ready      : std_logic;

    -- Core -> ONoC stream (data/valid driven by the core,
    -- ready driven by the ONoC interface).
    signal core_to_onoc_data_bus   : std_logic_vector(C_ONOC_DATA_WIDTH - 1 downto 0);
    signal core_to_onoc_valid      : std_logic;
    signal core_to_onoc_ready      : std_logic;

    -- 'sump' request, driven by the ARM controller's sump_out port.
    signal sump_state              : std_logic;

    -- Reset seen by the lower-level components: external reset OR sump.
    signal sump_controlled_reset   : std_logic;

    -- Component declarations for the main building blocks.
    -- These would be defined in separate files for a real design.
    component ARM_Interface_Controller is
        port (
            clk, reset       : in    std_logic;
            arm_bus_inout    : inout arm_bus_master;
            core_control_out : out   std_logic_vector(63 downto 0);
            core_status_in   : in    std_logic_vector(63 downto 0);
            -- Communicates the 'sump' state to the top level.
            sump_out         : out   std_logic
        );
    end component;

    component ONoC_Interface is
        port (
            clk, reset           : in  std_logic;
            optical_in           : in  optical_channel;
            optical_out          : out optical_channel;
            -- Electrical side, same width as the optical data path.
            electrical_in_bus    : in  std_logic_vector(C_ONOC_DATA_WIDTH - 1 downto 0);
            electrical_in_valid  : in  std_logic;
            electrical_in_ready  : out std_logic;
            electrical_out_bus   : out std_logic_vector(C_ONOC_DATA_WIDTH - 1 downto 0);
            electrical_out_valid : out std_logic;
            electrical_out_ready : in  std_logic
        );
    end component;

    component Neuromorphic_Core is
        port (
            clk, reset     : in  std_logic;
            control_in     : in  std_logic_vector(63 downto 0);
            status_out     : out std_logic_vector(63 downto 0);
            -- Electrical side, same width as the optical data path.
            onoc_in_bus    : in  std_logic_vector(C_ONOC_DATA_WIDTH - 1 downto 0);
            onoc_in_valid  : in  std_logic;
            onoc_in_ready  : out std_logic;
            onoc_out_bus   : out std_logic_vector(C_ONOC_DATA_WIDTH - 1 downto 0);
            onoc_out_valid : out std_logic;
            onoc_out_ready : in  std_logic
        );
    end component;

begin

    -- The 'sump' is the primary bypass bridge for the reset signal: the
    -- lower-level components are held in reset whenever either the external
    -- 'reset' or the internal 'sump_state' is active.
    sump_controlled_reset <= reset or sump_state;

    -- ARM controller.
    -- It is reset by the external 'reset' only. It is the driver of
    -- 'sump_state'; resetting it from 'sump_controlled_reset' would feed its
    -- own output back into its reset, so asserting the sump would reset the
    -- very block that has to release it again.
    U_ARM_Controller : ARM_Interface_Controller
        port map (
            clk              => clk,
            reset            => reset,
            arm_bus_inout    => arm_bus,
            core_control_out => arm_to_core_control_bus,
            core_status_in   => core_to_arm_status_bus,
            sump_out         => sump_state
        );

    -- ONoC interface, held in reset by the sump-controlled reset.
    U_ONoC_Interface : ONoC_Interface
        port map (
            clk                  => clk,
            reset                => sump_controlled_reset,
            optical_in           => optical_in_channel,
            optical_out          => optical_out_channel,
            electrical_in_bus    => core_to_onoc_data_bus,
            electrical_in_valid  => core_to_onoc_valid,
            electrical_in_ready  => core_to_onoc_ready,
            electrical_out_bus   => onoc_to_core_data_bus,
            electrical_out_valid => onoc_to_core_valid,
            electrical_out_ready => onoc_to_core_ready
        );

    -- Neuromorphic core, held in reset by the sump-controlled reset.
    U_Neuromorphic_Core : Neuromorphic_Core
        port map (
            clk            => clk,
            reset          => sump_controlled_reset,
            control_in     => arm_to_core_control_bus,
            status_out     => core_to_arm_status_bus,
            onoc_in_bus    => onoc_to_core_data_bus,
            onoc_in_valid  => onoc_to_core_valid,
            onoc_in_ready  => onoc_to_core_ready,
            onoc_out_bus   => core_to_onoc_data_bus,
            onoc_out_valid => core_to_onoc_valid,
            -- Back-pressure from the ONoC interface; this was previously
            -- mapped to the undeclared name 'onoc_out_ready'.
            onoc_out_ready => core_to_onoc_ready
        );

end architecture Structural;



# ==============================================================================

# BIOS Application Description Language (ADL)

# For Neuromorphic System (VHDL Architecture Map)

# ==============================================================================

# This script serves as a high-level blueprint for the BIOS/firmware.

# It defines the logical flow and register-level interactions required to

# initialize and manage the hardware components defined in the VHDL map.

# The code is written in a descriptive, C-like style for clarity.

# ==============================================================================


# ------------------------------------------------------------------------------

# Conceptual Hardware Registers

# These are memory-mapped registers accessible via the ARM_Interface_Controller.

# The addresses (in hex) are conceptual and would be defined in a real

# memory map specification.

# ------------------------------------------------------------------------------

class Registers:
    # Conceptual memory-mapped register addresses, grouped by owner.

    # -- Sump ---------------------------------------------------------------
    # One-bit sump reset control: 0x1 asserts the sump, 0x0 releases it.
    SUMP_CONTROL_ADDR = 0x00000001

    # -- Neuromorphic core --------------------------------------------------
    # Control bits (e.g. enabling/disabling layers or features).
    CORE_CONTROL_ADDR = 0x00000002
    # Status bits (e.g. busy, error flags, ready state).
    CORE_STATUS_ADDR = 0x00000003

    # -- ARM controller -----------------------------------------------------
    # Error code register.
    ARM_ERROR_ADDR = 0x00000004

    # -- On-Chip Network (ONoC) ---------------------------------------------
    # Configuration register.
    ONOC_CONFIG_ADDR = 0x00000005



# ------------------------------------------------------------------------------

# Core BIOS Functions (Pseudo-code)

# ------------------------------------------------------------------------------

def read_register(address):
    """
    Stand-in for a memory-mapped register read.

    Logs the address being read and hands back a fixed placeholder
    value; a real system would perform a low-level ARM bus read here.

    :param address: integer register address (printed as 8 hex digits).
    :return: always 0x00000000.
    """
    message = f"Reading from address: 0x{address:08X}"
    print(message)

    placeholder_value = 0x00000000
    return placeholder_value


def write_register(address, data):
    """
    Stand-in for a memory-mapped register write.

    Logs the data/address pair; a real system would perform a low-level
    ARM bus write here.

    :param address: integer register address (printed as 8 hex digits).
    :param data: integer value to write (printed as 8 hex digits).
    :return: always True.
    """
    message = f"Writing data 0x{data:08X} to address: 0x{address:08X}"
    print(message)

    succeeded = True
    return succeeded


# ------------------------------------------------------------------------------

# ADL: System Initialization and Sump Control

# ------------------------------------------------------------------------------

def init_system():
    """
    Main BIOS entry point: runs the start-up sequence in order.

    Fatal steps (a failure prints its message and aborts with False):
      1. assert the sump reset,
      2. test the ARM interface,
      3. release the sump reset.
    Non-fatal step:
      4. configure the core (a failure is only reported).
    Finally any pending error code is checked and cleared.

    :return: True when the sequence ran to the end, False when a fatal
             step failed.
    """
    banner = "--------------------------------------------------"
    print(banner)
    print("BIOS ADL: Starting System Initialization...")
    print(banner)

    # Each entry: (step to run, text printed on failure, text printed on success).
    fatal_steps = (
        (
            assert_sump_reset,
            "FATAL ERROR: Failed to assert sump reset. System halt.",
            "Sump reset asserted. All lower layers are in a known state.",
        ),
        (
            test_arm_interface,
            "FATAL ERROR: ARM interface test failed. System halt.",
            "ARM interface controller is operational.",
        ),
        (
            release_sump_reset,
            "FATAL ERROR: Failed to release sump reset. System halt.",
            "Sump reset released. Components are now active.",
        ),
    )
    for step, failure_text, success_text in fatal_steps:
        if not step():
            print(failure_text)
            return False
        print(success_text)

    # Core configuration problems are reported but do not stop the boot.
    core_configured = configure_core()
    if not core_configured:
        print("ERROR: Core configuration failed. Proceeding with caution.")

    check_and_clear_errors()

    print(banner)
    print("BIOS ADL: System Initialization Complete. Ready.")
    print(banner)
    return True


def assert_sump_reset():
    """
    Assert the 'sump' bypass reset.

    Writes 0x1 to the sump control register (the value documented on
    Registers.SUMP_CONTROL_ADDR as asserting the sump).

    :return: True if the register write reported success, else False.
    """
    write_ok = write_register(Registers.SUMP_CONTROL_ADDR, 0x1)
    return True if write_ok else False


def release_sump_reset():
    """
    Release the 'sump' bypass reset.

    Writes 0x0 to the sump control register (the value documented on
    Registers.SUMP_CONTROL_ADDR as releasing the sump).

    :return: True if the register write reported success, else False.
    """
    write_ok = write_register(Registers.SUMP_CONTROL_ADDR, 0x0)
    return True if write_ok else False


def test_arm_interface():
    """
    Minimal write-then-read probe of the ARM-to-core bus.

    Writes a test pattern to the core control register, then reads the
    core status register and treats any non-zero value as a pass.

    NOTE(review): the stub read_register in this file always returns
    0x00000000, so with the simulated bus this probe always reports
    failure -- confirm whether that is intended for the demonstration.

    :return: True if the status register read back non-zero, else False.
    """
    probe_pattern = 0x5A5A5A5A
    write_register(Registers.CORE_CONTROL_ADDR, probe_pattern)

    # Simplified check: the value is not compared with the pattern, only
    # tested for being non-zero.
    status_word = read_register(Registers.CORE_STATUS_ADDR)
    return status_word != 0x00000000


def configure_core():
    """
    Write the initial configuration word to the Neuromorphic Core.

    Prints a progress line, then writes an example configuration value
    to the core control register.

    :return: True if the register write reported success, else False.
    """
    print("Configuring Neuromorphic Core...")

    example_config = 0xDEADBEEF  # placeholder configuration word
    write_ok = write_register(Registers.CORE_CONTROL_ADDR, example_config)
    return True if write_ok else False


def check_and_clear_errors():
    """
    Report and clear a pending ARM controller error code.

    Reads the ARM error register. A zero value is reported as "no
    errors"; any other value is printed as a warning and the register is
    then written with 0 to clear it.
    """
    print("Checking for errors...")

    pending_code = read_register(Registers.ARM_ERROR_ADDR)
    if pending_code == 0x00000000:
        print("No errors found.")
        return

    print(f"WARNING: Error code 0x{pending_code:08X} detected. Clearing.")
    # No per-code recovery is performed here; the code is simply cleared.
    write_register(Registers.ARM_ERROR_ADDR, 0x0)



# ==============================================================================

# Execution

# ==============================================================================

# This is how the ADL would be called in a conceptual main routine.

# Run the start-up sequence only when this script is executed directly, so
# that importing the module does not trigger register writes as a side effect.
if __name__ == "__main__":
    init_system()


// ARMv9_A-Neuromorphic-VHDL-Adi-Protocol_Internet_4.0.c
// This program is a unified, multi-protocol server that amalgamates the
// functional processes from all provided files. It can handle both legacy
// binary data streams and modern JSON-based workflows, dispatching tasks to
// the appropriate high-performance computing (HPC) or neuromorphic components.
// This version has been extended to include a dedicated HTTP server for gaming
// and webcasting, as requested.

// --- Necessary Headers ---
#include <algorithm>
#include <bit>
#include <cerrno>
#include <cmath>
#include <cstdint>
#include <cstring>
#include <execution>
#include <iostream>
#include <limits>
#include <map>
#include <memory>
#include <mutex>
#include <numeric>
#include <random>
#include <sstream>
#include <stdexcept>
#include <string>
#include <thread>
#include <type_traits>
#include <vector>

#include <omp.h>

// For networking
#include <sys/socket.h>
#include <arpa/inet.h>
#include <netinet/in.h>
#include <unistd.h>

// For SIMD JSON parsing
#include "simdjson.h"
#define CPPHTTPLIB_OPENSSL_SUPPORT
#include "httplib.h"

// For ARM SVE/SVE2 intrinsics
#ifdef __aarch64__
#include <sys/auxv.h>
#include <asm/hwcap.h>
#include <arm_neon.h>
#include <arm_sve.h>
#endif

// --- CUDA Headers ---
#include <cuda_runtime.h>
#include <cublas_v2.h>

// Macro for CUDA error checking.
// Evaluates `call` once; if the result is not cudaSuccess, prints the CUDA
// error string with file and line to std::cerr and throws std::runtime_error.
// The argument is parenthesized and the temporary has a macro-specific name so
// that an expression mentioning a variable called `err` is not silently
// captured by the macro's own local.
#define CUDA_CHECK(call) \
    do { \
        const cudaError_t cuda_check_status_ = (call); \
        if (cuda_check_status_ != cudaSuccess) { \
            std::cerr << "CUDA Error: " << cudaGetErrorString(cuda_check_status_) \
                      << " at " << __FILE__ << ":" << __LINE__ << std::endl; \
            throw std::runtime_error("CUDA operation failed."); \
        } \
    } while (0)

// --- Common Constants (from all clients) ---
// Network settings.
constexpr int LEGACY_SERVER_PORT = 12345;
constexpr int HTTP_SERVER_PORT = 8080;
constexpr int CHUNK_SIZE = 4096;

// Operation code used by the legacy protocol (from n-math.py).
constexpr int OPERATION_LEGACY_INTERPOLATE = 2;

// Operation codes used by the workflow protocol
// (from n-dim.py and adi_neuromorphic.cpp).
// Note: code 2 is also the legacy interpolate code above.
constexpr int OPERATION_INTERPOLATE = 0;
constexpr int OPERATION_DIFFERENTIATE = 1;
constexpr int OPERATION_CALCULATE_GRADIENT_1D = 2;
constexpr int OPERATION_HYPERBOLIC_INTERCEPT_HANDLER = 3;
constexpr int OPERATION_INTEGRATE = 4;
constexpr int OPERATION_INTEGRATE_ND = 5;
constexpr int OPERATION_WORKFLOW = 6;
constexpr int OPERATION_NEUROMORPHIC_PREDICT = 7;
constexpr int OPERATION_EIGENVALUE_PACKING = 8;
constexpr int OPERATION_TENSOR_MATRIX_VECTOR_MULTIPLY_CUDA = 9;

// --- Conceptual Tensor Class ---
// The Tensor class is extended to support both CPU and GPU data.
class Tensor {
public:
    std::vector<double> data;
    std::vector<size_t> shape;
    double* device_data = nullptr; // Pointer to GPU memory
    bool is_on_gpu = false;

    Tensor() = default;

    Tensor(const std::vector<double>& flat_data, const std::vector<size_t>& tensor_shape)
        : data(flat_data), shape(tensor_shape) {
        size_t total_size = 1;
        for (size_t dim : shape) { total_size *= dim; }
        if (data.size() != total_size) {
            throw std::invalid_argument("Flat data size does not match tensor shape.");
        }
    }

    // Copy constructor
    Tensor(const Tensor& other)
        : data(other.data), shape(other.shape) {
        if (other.is_on_gpu) {
            to_gpu();
        }
    }

    // Move constructor
    Tensor(Tensor&& other) noexcept
        : data(std::move(other.data)), shape(std::move(other.shape)),
          device_data(other.device_data), is_on_gpu(other.is_on_gpu) {
        other.device_data = nullptr;
        other.is_on_gpu = false;
    }

    // Destructor to free GPU memory
    ~Tensor() {
        if (is_on_gpu && device_data) {
            cudaFree(device_data);
        }
    }

    // Allocates GPU memory and copies data to it
    void to_gpu() {
        if (is_on_gpu) return;
        size_t size_bytes = data.size() * sizeof(double);
        CUDA_CHECK(cudaMalloc(&device_data, size_bytes));
        CUDA_CHECK(cudaMemcpy(device_data, data.data(), size_bytes, cudaMemcpyHostToDevice));
        is_on_gpu = true;
    }

    // Copies data back to CPU and frees GPU memory
    void to_cpu() {
        if (!is_on_gpu) return;
        size_t size_bytes = data.size() * sizeof(double);
        CUDA_CHECK(cudaMemcpy(data.data(), device_data, size_bytes, cudaMemcpyDeviceToHost));
        CUDA_CHECK(cudaFree(device_data));
        device_data = nullptr;
        is_on_gpu = false;
    }

    size_t total_size() const {
        size_t size = 1;
        for(size_t dim : shape) {
            size *= dim;
        }
        return size;
    }
};

// --- Runtime feature detection ---
// Reports whether the running CPU advertises SVE.
// On AArch64 builds this tests the HWCAP_SVE bit of the AT_HWCAP auxiliary
// vector entry; on every other target it returns false.
bool has_sve_support() {
#ifndef __aarch64__
    return false;
#else
    const long capability_bits = getauxval(AT_HWCAP);
    const bool sve_advertised = (capability_bits & HWCAP_SVE) != 0;
    return sve_advertised;
#endif
}

// --- Neuromorphic Component: Spiking Neural Network (ported from Python) ---
// Leaky integrate-and-fire neuron.
// State is a single membrane potential that starts at v_rest, is pulled back
// towards v_rest, is driven by the input current, and is set to v_reset
// whenever it reaches v_thresh.
class LIFNeuron {
public:
    LIFNeuron(double tau_m = 20.0, double v_rest = -65.0, double v_reset = -65.0, double v_thresh = -50.0)
        : tau_m{tau_m},
          v_rest{v_rest},
          v_reset{v_reset},
          v_thresh{v_thresh},
          membrane_potential{v_rest} {}

    // Advances the neuron by one step of length `dt` under `input_current`.
    // Returns true when the potential reached the threshold on this step (the
    // potential is then set to v_reset), false otherwise.
    bool update(double input_current, double dt) {
        const double displacement = membrane_potential - v_rest;
        const double rate_of_change = (-displacement + input_current) / tau_m;
        membrane_potential += rate_of_change * dt;

        const bool fired = membrane_potential >= v_thresh;
        if (!fired) {
            return false;
        }
        membrane_potential = v_reset;
        return true;
    }

private:
    double tau_m, v_rest, v_reset, v_thresh, membrane_potential;
};

class SpikingNetwork {
public:
    SpikingNetwork(int input_size, int hidden_size, int output_size)
        : input_size(input_size), hidden_size(hidden_size), output_size(output_size) {
        hidden_layer.resize(hidden_size);
        output_layer.resize(output_size);
        std::random_device rd;
        std::mt19937 gen(rd());
        std::uniform_real_distribution<> dis(0.0, 1.0);
        input_to_hidden_weights.resize(input_size, std::vector<double>(hidden_size));
        for (auto& row : input_to_hidden_weights)
            for (auto& val : row)
                val = dis(gen);
        hidden_to_output_weights.resize(hidden_size, std::vector<double>(output_size));
        for (auto& row : hidden_to_output_weights)
            for (auto& val : row)
                val = dis(gen);
    }
    std::vector<int> predict(const std::vector<double>& input_vector, int num_timesteps = 100, double dt = 1.0) {
        if (input_vector.size() != input_size) {
            throw std::runtime_error("Input vector size mismatch.");
        }
        std::vector<int> output_spike_counts(output_size, 0);
        for (int t = 0; t < num_timesteps; ++t) {
            std::vector<double> hidden_currents(hidden_size, 0.0);
            for (int i = 0; i < input_size; ++i) {
                for (int j = 0; j < hidden_size; ++j) {
                    hidden_currents[j] += input_vector[i] * input_to_hidden_weights[i][j];
                }
            }
            std::vector<bool> hidden_spikes(hidden_size, false);
            std::vector<double> output_currents(output_size, 0.0);
            for (int j = 0; j < hidden_size; ++j) {
                if (hidden_layer[j].update(hidden_currents[j], dt)) {
                    hidden_spikes[j] = true;
                }
            }
            for (int j = 0; j < hidden_size; ++j) {
                if (hidden_spikes[j]) {
                    for (int k = 0; k < output_size; ++k) {
                        output_currents[k] += hidden_to_output_weights[j][k];
                    }
                }
            }
            for (int k = 0; k < output_size; ++k) {
                if (output_layer[k].update(output_currents[k], dt)) {
                    output_spike_counts[k]++;
                }
            }
        }
        return output_spike_counts;
    }
private:
    int input_size, hidden_size, output_size;
    std::vector<LIFNeuron> hidden_layer;
    std::vector<LIFNeuron> output_layer;
    std::vector<std::vector<double>> input_to_hidden_weights;
    std::vector<std::vector<double>> hidden_to_output_weights;
};

// --- CORE MATH FUNCTIONS (vectorized for ARM) ---
// Packs eigenvalues element-wise: a value v with |v| >= 1 is replaced by
// acos(1 / v) (its arcsecant); any other value is copied unchanged.
// Returns a vector of the same length as the input.
//
// The earlier SVE branch was removed for two reasons: it called
// `svacos_f64_z`, which is not part of the Arm SVE intrinsics, and when SVE
// was detected at runtime but the translation unit was built without
// __ARM_FEATURE_SVE the whole computation was compiled out and zeros were
// returned. The portable loop below is used on every target.
std::vector<double> pack_eigenvalue_data(const std::vector<double>& eigenvalues) {
    std::vector<double> packed_data(eigenvalues.size());
    std::cout << "Packing eigenvalues with parallel scalar loop." << std::endl;
#pragma omp parallel for
    for (size_t i = 0; i < eigenvalues.size(); ++i) {
        const double val = eigenvalues[i];
        // |val| >= 1 guarantees 1/val lies in [-1, 1], the domain of acos.
        packed_data[i] = (std::abs(val) >= 1.0) ? std::acos(1.0 / val) : val;
    }
    return packed_data;
}

// Forward differences of a 1-D tensor: output[i] = input[i + 1] - input[i].
// The result is a 1-D tensor with one element fewer than the input.
// Throws std::invalid_argument unless the input is 1-D with at least two
// elements.
//
// std::adjacent_difference is not used because it writes its first input
// element unchanged to the first output slot, which made element 0 of the
// result input[1] instead of input[1] - input[0].
Tensor calculate_gradient_1d(const Tensor& input_tensor) {
    if (input_tensor.shape.size() != 1 || input_tensor.data.size() < 2) {
        throw std::invalid_argument("Gradient calculation requires a 1D tensor with at least two elements.");
    }
    const std::vector<double>& values = input_tensor.data;
    std::vector<double> gradient_data(values.size() - 1);
    std::cout << "Using CPU parallel adjacent_difference." << std::endl;
    // Pairs each element (from index 1 on) with its predecessor.
    std::transform(std::execution::par,
                   values.begin() + 1, values.end(),
                   values.begin(),
                   gradient_data.begin(),
                   [](double next, double previous) { return next - previous; });
    return Tensor(gradient_data, {gradient_data.size()});
}

// Ported from n-math.py, but simplified for C++ compatibility and OpenMP.
// Quadratic (three-point Lagrange) interpolation of several data series.
//
// `data_dict` entries whose key starts with "fx" are X series and those whose
// key starts with "fy" are Y series; the k-th X series (in map order) is
// paired with the k-th Y series. For every series and every query point in
// `x_interp`, the three samples whose X is closest to the query are used to
// evaluate the Lagrange parabola at the query point.
//
// Returns the results series by series, each series in `x_interp` order, so
// the vector has (number of series) * x_interp.size() elements. A query whose
// three nearest samples do not have distinct X values yields 0.0.
//
// Throws std::invalid_argument when the number of X and Y series differ, when
// `x_interp` is empty, or when a series pair differs in length or has fewer
// than three points. All validation happens before the parallel loop because
// an exception must not leave an OpenMP parallel region.
std::vector<double> hyperbolic_parabolic_interpolation(
    const std::map<std::string, std::vector<double>>& data_dict,
    const std::vector<double>& x_interp) {

    // Borrow the series instead of copying them.
    std::vector<const std::vector<double>*> fx_series;
    std::vector<const std::vector<double>*> fy_series;
    for (const auto& entry : data_dict) {
        if (entry.first.compare(0, 2, "fx") == 0) {
            fx_series.push_back(&entry.second);
        } else if (entry.first.compare(0, 2, "fy") == 0) {
            fy_series.push_back(&entry.second);
        }
    }

    if (fx_series.size() != fy_series.size() || x_interp.empty()) {
        throw std::invalid_argument("Invalid data for interpolation.");
    }
    const size_t series_count = fx_series.size();
    const size_t query_count = x_interp.size();
    for (size_t i = 0; i < series_count; ++i) {
        if (fx_series[i]->size() != fy_series[i]->size() || fx_series[i]->size() < 3) {
            throw std::invalid_argument("X and Y data must have equal length and at least three points.");
        }
    }

    // Each (series, query) pair owns a fixed slot, so the output order does
    // not depend on which thread finishes first and no locking is needed.
    std::vector<double> all_interp_y(series_count * query_count, 0.0);

#pragma omp parallel for
    for (size_t i = 0; i < series_count; ++i) {
        const std::vector<double>& fx = *fx_series[i];
        const std::vector<double>& fy = *fy_series[i];
        std::vector<size_t> order(fx.size());

        for (size_t q = 0; q < query_count; ++q) {
            const double x = x_interp[q];

            // Rank sample indices by distance to x; ties are broken by X value
            // and then by index so the choice is deterministic.
            const auto closer = [&fx, x](size_t a, size_t b) {
                const double da = std::abs(fx[a] - x);
                const double db = std::abs(fx[b] - x);
                if (da != db) return da < db;
                if (fx[a] != fx[b]) return fx[a] < fx[b];
                return a < b;
            };
            std::iota(order.begin(), order.end(), size_t{0});
            std::partial_sort(order.begin(), order.begin() + 3, order.end(), closer);

            const double x1 = fx[order[0]], x2 = fx[order[1]], x3 = fx[order[2]];
            const double y1 = fy[order[0]], y2 = fy[order[1]], y3 = fy[order[2]];

            const double denom1 = (x1 - x2) * (x1 - x3);
            const double denom2 = (x2 - x1) * (x2 - x3);
            const double denom3 = (x3 - x1) * (x3 - x2);
            double& slot = all_interp_y[i * query_count + q];
            if (denom1 == 0 || denom2 == 0 || denom3 == 0) {
                slot = 0.0;  // repeated X among the three nearest samples
                continue;
            }
            const double basis1 = ((x - x2) * (x - x3)) / denom1;
            const double basis2 = ((x - x1) * (x - x3)) / denom2;
            const double basis3 = ((x - x1) * (x - x2)) / denom3;
            slot = basis1 * y1 + basis2 * y2 + basis3 * y3;
        }
    }
    return all_interp_y;
}

// --- Helper Functions ---
// Reads exactly `len` bytes from `sockfd` into `buf`, calling recv() until the
// buffer is full.
// Returns `len` on success, or -1 if recv() reports an error or the peer
// closes the connection before `len` bytes arrived. A recv() interrupted by a
// signal (EINTR) is retried instead of being treated as a failure.
ssize_t receive_all(int sockfd, void* buf, size_t len) {
    char* const destination = static_cast<char*>(buf);
    size_t total_received = 0;
    while (total_received < len) {
        const ssize_t chunk = recv(sockfd, destination + total_received, len - total_received, 0);
        if (chunk < 0 && errno == EINTR) {
            continue;  // interrupted before any data arrived; try again
        }
        if (chunk <= 0) {
            return -1;  // error, or orderly shutdown by the peer
        }
        total_received += static_cast<size_t>(chunk);
    }
    return static_cast<ssize_t>(total_received);
}

// Writes all `len` bytes of `buf` to `sockfd`, repeating send() after partial
// writes and after EINTR. Returns false if send() fails or makes no progress.
static bool send_fully(int sockfd, const void* buf, size_t len) {
    const char* const source = static_cast<const char*>(buf);
    size_t total_sent = 0;
    while (total_sent < len) {
        const ssize_t chunk = send(sockfd, source + total_sent, len - total_sent, 0);
        if (chunk < 0 && errno == EINTR) {
            continue;
        }
        if (chunk <= 0) {
            return false;
        }
        total_sent += static_cast<size_t>(chunk);
    }
    return true;
}

// Sends one frame: a 32-bit big-endian byte count followed by the payload.
// Nothing is sent if the payload does not fit the 32-bit length field, and
// the payload is skipped if the length header could not be sent, so a
// failure never produces a frame whose header and body disagree.
static void send_length_prefixed(int sockfd, const void* payload, size_t payload_bytes) {
    if (payload_bytes > UINT32_MAX) {
        std::cerr << "send failed: payload of " << payload_bytes
                  << " bytes exceeds the 32-bit length prefix." << std::endl;
        return;
    }
    const uint32_t wire_length = htonl(static_cast<uint32_t>(payload_bytes));
    if (!send_fully(sockfd, &wire_length, sizeof(wire_length)) ||
        !send_fully(sockfd, payload, payload_bytes)) {
        std::cerr << "send failed: " << std::strerror(errno) << std::endl;
    }
}

// Sends `result` to the client as a length-prefixed block of raw doubles in
// host byte order. Failures are logged to std::cerr; nothing is thrown.
void send_raw_result(int client_socket, const std::vector<double>& result) {
    send_length_prefixed(client_socket, result.data(), result.size() * sizeof(double));
}

// Sends "Error: " + `message` to the client as a length-prefixed string.
// Failures are logged to std::cerr; nothing is thrown.
void send_raw_error(int client_socket, const std::string& message) {
    const std::string error_msg = "Error: " + message;
    send_length_prefixed(client_socket, error_msg.data(), error_msg.length());
}

// --- CUDA Kernel for matrix-vector multiplication ---
// Computes y = A * x, where A is an m x n matrix stored row-major
// (element (r, c) at A[r * n + c]), x has n elements and y has m elements.
// There is no alpha/beta scaling: y is overwritten, not accumulated into.
// Each thread computes one row; threads whose index is >= m do nothing.
__global__ void matrixVectorMultiplyKernel(int m, int n, const double* A, const double* x, double* y) {
    const int row = blockIdx.x * blockDim.x + threadIdx.x;
    if (row >= m) {
        return;
    }
    // The row offset is computed in size_t: row * n in int arithmetic can
    // overflow once the matrix has more than INT_MAX elements.
    const double* const row_values = A + static_cast<size_t>(row) * static_cast<size_t>(n);
    double sum = 0.0;
    for (int col = 0; col < n; ++col) {
        sum += row_values[col] * x[col];
    }
    y[row] = sum;
}

// --- Tensor Operation Functions ---
// Returns a new tensor with the same shape as `input_tensor` in which every
// element is the corresponding input element multiplied by 2.0.
Tensor tensor_transform(const Tensor& input_tensor) {
    const std::vector<double>& source = input_tensor.data;
    const size_t element_count = source.size();
    std::vector<double> doubled(element_count);
#pragma omp parallel for
    for (size_t idx = 0; idx < element_count; ++idx) {
        doubled[idx] = source[idx] * 2.0;
    }
    return Tensor(doubled, input_tensor.shape);
}

// New function using CUDA for matrix-vector multiplication
Tensor tensor_matrix_vector_multiply_cuda(const Tensor& matrix_tensor, const Tensor& vector_tensor) {
    if (matrix_tensor.shape.size() != 2 || vector_tensor.shape.size() != 1) {
        throw std::invalid_argument("Matrix-vector multiplication requires a 2D matrix and a 1D vector.");
    }
    size_t m = matrix_tensor.shape[0];
    size_t n = matrix_tensor.shape[1];
    if (n != vector_tensor.shape[0]) {
        throw std::invalid_argument("Matrix columns must equal vector size for multiplication.");
    }
    
    // Create new tensor for the result
    Tensor result_tensor;
    result_tensor.shape = {m};
    result_tensor.data.resize(m);

    // Copy host data to device
    double *d_A, *d_x, *d_y;
    CUDA_CHECK(cudaMalloc(&d_A, m * n * sizeof(double)));
    CUDA_CHECK(cudaMalloc(&d_x, n * sizeof(double)));
    CUDA_CHECK(cudaMalloc(&d_y, m * sizeof(double)));
    
    CUDA_CHECK(cudaMemcpy(d_A, matrix_tensor.data.data(), m * n * sizeof(double), cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_x, vector_tensor.data.data(), n * sizeof(double), cudaMemcpyHostToDevice));
    
    // Launch kernel
    int threads_per_block = 256;
    int blocks_per_grid = (m + threads_per_block - 1) / threads_per_block;
    matrixVectorMultiplyKernel<<<blocks_per_grid, threads_per_block>>>(m, n, d_A, d_x, d_y);
    CUDA_CHECK(cudaGetLastError()); // Check for kernel launch errors
    CUDA_CHECK(cudaDeviceSynchronize()); // Wait for kernel to finish

    // Copy result back to host
    CUDA_CHECK(cudaMemcpy(result_tensor.data.data(), d_y, m * sizeof(double), cudaMemcpyDeviceToHost));

    // Clean up device memory
    CUDA_CHECK(cudaFree(d_A));
    CUDA_CHECK(cudaFree(d_x));
    CUDA_CHECK(cudaFree(d_y));
    
    return result_tensor;
}

// --- Workflow Handlers ---
std::vector<double> handle_workflow_json(simdjson::ondemand::document& workflow_doc) {
    using namespace simdjson;
    auto data_store = std::make_unique<std::map<std::string, Tensor>>();
    std::vector<double> final_result_data;

    for (auto& step : workflow_doc.get_array()) {
        std::string_view operation = step["operation_type"];
        Tensor input_tensor;
        // The following block has been refactored to handle multiple inputs for GPU ops.
        std::string_view input_type;
        try { input_type = step["input_data"]["type"]; }
        catch(...) { input_type = "multi"; } // Assume multi-input for new operations

        Tensor input_tensor_2; // Second input for matrix-vector multiplication

        if (operation == "TENSOR_MATRIX_VECTOR_MULTIPLY_CUDA") {
            // Handle multiple inputs for the CUDA operation
            auto matrix_data_source = step["input_data"]["matrix_source"];
            auto vector_data_source = step["input_data"]["vector_source"];
            
            // Resolve matrix input
            if (matrix_data_source["type"] == "direct") {
                std::vector<double> flat_data;
                for (auto val : matrix_data_source["data"].get_array()) { flat_data.push_back(val.get_double()); }
                std::vector<size_t> shape;
                for (auto val : matrix_data_source["shape"].get_array()) { shape.push_back(size_t(val.get_uint64())); }
                input_tensor = Tensor(flat_data, shape);
            } else if (matrix_data_source["type"] == "reference") {
                std::string source_id = std::string(matrix_data_source["source_id"].get_string());
                auto it = data_store->find(source_id);
                if (it != data_store->end()) { input_tensor = it->second; }
                else { throw std::runtime_error("Referenced matrix data not found: " + source_id); }
            }

            // Resolve vector input
            if (vector_data_source["type"] == "direct") {
                std::vector<double> flat_data;
                for (auto val : vector_data_source["data"].get_array()) { flat_data.push_back(val.get_double()); }
                std::vector<size_t> shape;
                for (auto val : vector_data_source["shape"].get_array()) { shape.push_back(size_t(val.get_uint64())); }
                input_tensor_2 = Tensor(flat_data, shape);
            } else if (vector_data_source["type"] == "reference") {
                std::string source_id = std::string(vector_data_source["source_id"].get_string());
                auto it = data_store->find(source_id);
                if (it != data_store->end()) { input_tensor_2 = it->second; }
                else { throw std::runtime_error("Referenced vector data not found: " + source_id); }
            }
        } else {
            // Handle single input for existing operations
            auto input_data = step["input_data"];
            input_type = input_data["type"];

            if (input_type == "direct") {
                if (operation == "INTERPOLATE") {
                    // Handle the complex list of lists structure for interpolation
                    std::map<std::string, std::vector<double>> interpolation_data;
                    auto fx_data_list = input_data["fx_data"].get_array();
                    auto fy_data_list = input_data["fy_data"].get_array();
                    size_t idx = 0;
                    for (auto fx : fx_data_list) {
                        std::vector<double> fx_vec;
                        for (auto val : fx.get_array()) fx_vec.push_back(val.get_double());
                        interpolation_data["fx" + std::to_string(idx)] = std::move(fx_vec);
                        auto fy = fy_data_list.at(idx).get_array();
                        std::vector<double> fy_vec;
                        for (auto val : fy) fy_vec.push_back(val.get_double());
                        interpolation_data["fy" + std::to_string(idx)] = std::move(fy_vec);
                        idx++;
                    }
                    std::vector<double> x_interp;
                    for (auto val : step["parameters"]["x_interp_points"].get_array()) { x_interp.push_back(val.get_double()); }
                    
                    std::vector<double> interp_result = hyperbolic_parabolic_interpolation(interpolation_data, x_interp);
                    input_tensor = Tensor(interp_result, {interp_result.size()});

                } else {
                    std::vector<double> flat_data;
                    for (auto val : input_data["data"].get_array()) { flat_data.push_back(val.get_double()); }
                    std::vector<size_t> shape;
                    for (auto val : input_data["shape"].get_array()) { shape.push_back(size_t(val.get_uint64())); }
                    input_tensor = Tensor(flat_data, shape);
                }
            } else if (input_type == "reference") {
                std::string source_id = std::string(input_data["source_id"].get_string());
                auto it = data_store->find(source_id);
                if (it != data_store->end()) { input_tensor = it->second; }
                else { throw std::runtime_error("Referenced data not found: " + source_id); }
            }
        }

        Tensor result_tensor;
        if (operation == "CALCULATE_GRADIENT_1D") {
            result_tensor = calculate_gradient_1d(input_tensor);
        } else if (operation == "TENSOR_TRANSFORMATION") {
            result_tensor = tensor_transform(input_tensor);
        } else if (operation == "EIGENVALUE_PACKING") {
            std::vector<double> unpacked_data = pack_eigenvalue_data(input_tensor.data);
            result_tensor = Tensor(unpacked_data, input_tensor.shape);
        } else if (operation == "NEUROMORPHIC_PREDICT") {
            SpikingNetwork snn(input_tensor.data.size(), 10, 5);
            std::vector<int> spike_counts = snn.predict(input_tensor.data);
            std::vector<double> spike_double;
            for (int count : spike_counts) spike_double.push_back(static_cast<double>(count));
            result_tensor = Tensor(spike_double, {spike_double.size()});
        } else if (operation == "TENSOR_MATRIX_VECTOR_MULTIPLY_CUDA") {
            result_tensor = tensor_matrix_vector_multiply_cuda(input_tensor, input_tensor_2);
        } else {
            throw std::runtime_error("Unsupported operation: " + std::string(operation));
        }

        auto output_id_res = step["output_id"];
        if (output_id_res.error() == SUCCESS) {
            (*data_store)[std::string(output_id_res.get_string())] = result_tensor;
        } else {
            final_result_data = result_tensor.data;
        }
    }
    return final_result_data;
}

void handle_json_workflow_request(int client_socket, const std::string& payload_json) {
    using namespace simdjson;
    try {
        padded_string padded_payload = padded_string::load(payload_json);
        ondemand::parser parser;
        ondemand::document workflow_doc = parser.iterate(padded_payload);
        std::vector<double> result_data = handle_workflow_json(workflow_doc);

        std::string response = "{ \"status\": \"success\", \"result\": [";
        for (size_t i = 0; i < result_data.size(); ++i) {
            response += std::to_string(result_data[i]);
            if (i < result_data.size() - 1) { response += ", "; }
        }
        response += "] }";
        send(client_socket, response.c_str(), response.length(), 0);
    } catch (const std::exception& e) {
        std::string error_response = "{ \"status\": \"error\", \"message\": \"" + std::string(e.what()) + "\" }";
        send(client_socket, error_response.c_str(), error_response.length(), 0);
    }
    close(client_socket);
}

void handle_legacy_binary(int client_socket, uint8_t initial_op_code) {
    try {
        if (initial_op_code != OPERATION_LEGACY_INTERPOLATE) { send_raw_error(client_socket, "Invalid operation code."); return; }
        uint32_t num_dims;
        if (receive_all(client_socket, &num_dims, sizeof(uint32_t)) <= 0) { send_raw_error(client_socket, "Disconnected during dimension count."); return; }
        num_dims = ntohl(num_dims);
        std::map<std::string, std::vector<double>> data_dict;
        std::vector<double> x_interp;
        for (uint32_t i = 0; i < num_dims; ++i) {
            uint32_t fx_len, fy_len;
            if (receive_all(client_socket, &fx_len, sizeof(uint32_t)) <= 0 ||
                receive_all(client_socket, &fy_len, sizeof(uint32_t)) <= 0) { send_raw_error(client_socket, "Disconnected during length reception."); return; }
            fx_len = ntohl(fx_len); fy_len = ntohl(fy_len);
            std::vector<double> fx_data(fx_len);
            std::vector<double> fy_data(fy_len);
            if (receive_all(client_socket, fx_data.data(), fx_len * sizeof(double)) <= 0 ||
                receive_all(client_socket, fy_data.data(), fy_len * sizeof(double)) <= 0) { send_raw_error(client_socket, "Incomplete data."); return; }
            data_dict["fx" + std::to_string(i)] = fx_data;
            data_dict["fy" + std::to_string(i)] = fy_data;
        }
        uint32_t x_interp_len;
        if (receive_all(client_socket, &x_interp_len, sizeof(uint32_t)) <= 0) { send_raw_error(client_socket, "Disconnected during interp length."); return; }
        x_interp_len = ntohl(x_interp_len);
        x_interp.resize(x_interp_len);
        if (receive_all(client_socket, x_interp.data(), x_interp_len * sizeof(double)) <= 0) { send_raw_error(client_socket, "Incomplete interp data."); return; }
        std::vector<double> result = hyperbolic_parabolic_interpolation(data_dict, x_interp);
        send_raw_result(client_socket, result);
    } catch (const std::exception& e) {
        send_raw_error(client_socket, e.what());
    }
    close(client_socket);
}

void handle_client(int client_socket) {
    uint8_t op_code_buffer[1];
    ssize_t bytes_peeked = recv(client_socket, op_code_buffer, 1, MSG_PEEK);
    if (bytes_peeked <= 0) { close(client_socket); return; }
    uint8_t op_code = op_code_buffer[0];
    recv(client_socket, op_code_buffer, 1, 0);
    if (op_code == OPERATION_WORKFLOW) {
        uint32_t payload_len;
        if (receive_all(client_socket, &payload_len, sizeof(payload_len)) <= 0) { close(client_socket); return; }
        payload_len = ntohl(payload_len);
        std::string payload(payload_len, '\0');
        if (receive_all(client_socket, &payload[0], payload_len) <= 0) { close(client_socket); return; }
        handle_json_workflow_request(client_socket, payload);
    } else {
        handle_legacy_binary(client_socket, op_code);
    }
}

void start_unified_server() {
    int server_fd, client_socket;
    struct sockaddr_in address;
    int addrlen = sizeof(address);
    if ((server_fd = socket(AF_INET, SOCK_STREAM, 0)) == 0) { perror("Socket creation failed"); return; }
    int opt = 1;
    if (setsockopt(server_fd, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt)) < 0) {
        perror("setsockopt");
        close(server_fd);
        return;
    }
    address.sin_family = AF_INET;
    address.sin_addr.s_addr = INADDR_ANY;
    address.sin_port = htons(LEGACY_SERVER_PORT);
    if (bind(server_fd, (struct sockaddr *)&address, sizeof(address)) < 0) { perror("Bind failed"); return; }
    if (listen(server_fd, 5) < 0) { perror("Listen failed"); return; }
    std::cout << "Unified server listening on port " << LEGACY_SERVER_PORT << std::endl;
    while (true) {
        if ((client_socket = accept(server_fd, (struct sockaddr *)&address, (socklen_t*)&addrlen)) < 0) { perror("Accept failed"); continue; }
        std::thread client_thread(handle_client, client_socket);
        client_thread.detach();
    }
}

int main() {
    start_unified_server();
    return 0;
}



No comments: