diff --git a/examples/c++/risc0/CMakeLists.txt b/examples/c++/risc0/CMakeLists.txt
new file mode 100644
index 00000000..b7710214
--- /dev/null
+++ b/examples/c++/risc0/CMakeLists.txt
@@ -0,0 +1,28 @@
+cmake_minimum_required(VERSION 3.18)
+set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CUDA_STANDARD 17)
+set(CMAKE_CUDA_STANDARD_REQUIRED TRUE)
+set(CMAKE_CXX_STANDARD_REQUIRED TRUE)
+# Pick the GPU architecture before project() enables the CUDA language.
+if(CMAKE_VERSION VERSION_LESS "3.24.0")
+    set(CMAKE_CUDA_ARCHITECTURES ${CUDA_ARCH}) # "native" is unavailable here; pass -DCUDA_ARCH=<arch> to choose one
+else()
+    set(CMAKE_CUDA_ARCHITECTURES native) # CMake 3.24+: build for the GPU(s) detected on this machine
+endif()
+project(example LANGUAGES CUDA CXX)
+
+# nvcc flags: Release flags are cleared; Debug builds get host (-g) and device (-G) debug info without optimization.
+set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr")
+set(CMAKE_CUDA_FLAGS_RELEASE "")
+set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -g -G -O0")
+
+add_executable(example example.cu)
+set_target_properties(example PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
+target_include_directories(example PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/../../../icicle/include")
+
+# To target another curve/field, change both FIELD_ID and the library linked below.
+target_compile_definitions(example PRIVATE FIELD_ID=1001)
+# The static library is prebuilt by compile.sh into build/icicle next to this file.
+target_link_libraries(
+  example PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/build/icicle/lib/libingo_field_babybear.a"
+)
\ No newline at end of file
diff --git a/examples/c++/risc0/README.md b/examples/c++/risc0/README.md
new file mode 100644
index 00000000..743d003d
--- /dev/null
+++ b/examples/c++/risc0/README.md
@@ -0,0 +1,44 @@
+# ICICLE example: RISC0's Fibonacci sequence proof using Polynomial API
+
+## Why RISC0?
+
+[RISC0 Protocol](https://www.risczero.com/) creates computational integrity proofs (a.k.a. Zero Knowledge Proofs) for programs executing on RISC-V architecture.
+The proofs are created for sequences of values in RISC-V registers, called execution traces.
+This approach is transparent to developers and enables the use of general purpose languages.
+
+## Best-Practices
+
+This example builds on the [ICICLE Polynomial API](../polynomial-api/README.md) example, so we recommend running that example first.
+
+## Key-Takeaway
+
+RISC0 encodes execution traces into very large polynomials and commits them using Merkle trees.
+FRI speeds up validation of such commitments by recursively generating smaller polynomials (and trees) from larger ones.
+The key enabler for *recursion* is the *redundancy* of polynomial commitments, hence the use of Reed-Solomon codes.
+
+## Running the example
+
+To run the example, from the project root directory:
+
+```sh
+cd examples/c++/risc0
+./compile.sh
+./run.sh
+```
+
+## What's in the example
+
+The example follows [STARK by Hand](https://dev.risczero.com/proof-system/stark-by-hand), structured in the following Lessons:
+
+1. The Execution Trace
+2. Rule checks to validate a computation
+3. Padding the Trace
+4. Constructing Trace Polynomials
+5. ZK Commitments of the Trace Data
+6. Constraint Polynomials
+7. Mixing Constraint Polynomials
+8. The Core of the RISC Zero STARK
+9. The DEEP Technique
+10. Mixing (Batching) for FRI
+11. FRI Protocol (Commit Phase)
+12. FRI Protocol (Query Phase)
diff --git a/examples/c++/risc0/compile.sh b/examples/c++/risc0/compile.sh
new file mode 100755
index 00000000..43392572
--- /dev/null
+++ b/examples/c++/risc0/compile.sh
@@ -0,0 +1,15 @@
+#!/bin/bash
+# Builds the ICICLE babybear field library, then this example, under ./build next to this script.
+# Exit immediately on error
+set -e
+# Every path below is relative to this script's directory, so switch there regardless of the caller's cwd
+cd "$(dirname "${BASH_SOURCE[0]}")"
+mkdir -p build/example build/icicle
+
+# Configure and build Icicle
+cmake -S ../../../icicle/ -B build/icicle -DCMAKE_BUILD_TYPE=Release -DFIELD=babybear
+cmake --build build/icicle
+
+# Configure and build the example application
+cmake -S . -B build/example
+cmake --build build/example
\ No newline at end of file
diff --git a/examples/c++/risc0/example.cu b/examples/c++/risc0/example.cu
new file mode 100644
index 00000000..e07128e8
--- /dev/null
+++ b/examples/c++/risc0/example.cu
@@ -0,0 +1,275 @@
+#include <iostream>
+#include <list>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "polynomials/polynomials.h"
+#include "polynomials/cuda_backend/polynomial_cuda_backend.cuh"
+#include "ntt/ntt.cuh"
+
+using namespace polynomials;
+
+// Polynomial type used throughout this example (coefficients are scalar_t)
+using Polynomial_t = Polynomial<scalar_t>;
+
+// Integer type holding one RISC-V register value of the trace
+using rv_t = int64_t;
+
+// Convert n RISC-V register values rv[0..n) into field elements s[0..n) via scalar_t::from
+void to_ff(rv_t* rv, scalar_t* s, size_t n) {
+  for (size_t i = 0; i < n; ++i) {  // size_t index matches n (was a signed/unsigned comparison)
+    s[i] = scalar_t::from(rv[i]);
+  }
+}
+
+// Print `header`, then the value of *p at shift * omega^i for i = 0 .. 2^logn - 1, one "i: value" line each.
+void p_print(Polynomial_t * p, int logn, scalar_t shift, std::string header = "Print Vector") {
+  std::cout << header << std::endl;
+  const int domain_size = 1 << logn;
+  auto step = scalar_t::omega(logn);
+  auto point = shift;
+  for (int row = 0; row < domain_size; ++row, point = point*step) {
+    std::cout << row << ": " << (*p)(point) << std::endl;
+  }
+}
+
+// Wrap a single field element as a polynomial with exactly one coefficient
+Polynomial_t p_value(scalar_t value) {
+  const scalar_t* single_coeff = &value;
+  return Polynomial_t::from_coefficients(single_coeff, 1 /* number of coefficients */);
+}
+
+// Rotate p's evaluations on the 2^logn roots-of-unity domain right by one (out[i] = in[i-1], out[0] = in[last]) and rebuild a polynomial from them.
+Polynomial_t p_rotate(Polynomial_t* p, int logn) {
+  const int size = 1 << logn;
+  auto evals = std::make_unique<scalar_t[]>(size);
+  p->evaluate_on_rou_domain(logn, evals.get());
+  scalar_t wrapped = evals[size-1];  // last value wraps around into slot 0
+  for (int dst = size-1; dst >= 1; --dst) {
+    evals[dst] = evals[dst-1];
+  }
+  evals[0] = wrapped;
+  return Polynomial_t::from_rou_evaluations(evals.get(), size);
+}
+
+// Mix nmix >= 1 polynomials into in[0] + m*in[1] + m^2*in[2] + ... with m = mix_parameter (in[0] is always read)
+Polynomial_t p_mix(Polynomial_t* in[], size_t nmix, scalar_t mix_parameter) {
+  scalar_t factor = mix_parameter;
+  Polynomial_t out = in[0]->clone();
+  for (size_t i = 1; i < nmix; ++i) {  // size_t index matches nmix (was a signed/unsigned comparison)
+    out += factor * (*in[i]);
+    factor = factor * mix_parameter;
+  }
+  return out;
+}
+
+void solve_linear(scalar_t xa, scalar_t ya, scalar_t xb, scalar_t yb, scalar_t * coeffs) {
+  scalar_t slope = (ya - yb) * scalar_t::inverse(xa - xb);  // line through (xa, ya) and (xb, yb); inverts xa - xb
+  coeffs[1] = slope; coeffs[0] = ya - slope * xa;           // y = coeffs[0] + coeffs[1] * x
+}
+
+// Evaluate p on the size-n NTT domain shifted by coset generator `shift`; returns nullptr if p needs more than n coefficients or the NTT fails.
+std::unique_ptr<scalar_t[]> InterpolateOnLargerDomain(Polynomial_t * p, int n, scalar_t shift = scalar_t::one()) {
+  const int deg = p->degree();
+  if (deg >= n) { std::cerr << "InterpolateOnLargerDomain: degree " << deg << " does not fit a domain of size " << n << std::endl; return nullptr; }
+  auto input = std::make_unique<scalar_t[]>(n);
+  for (int i = 0; i < n; ++i) { input[i] = scalar_t::zero(); }  // explicit zero padding; does not rely on scalar_t's default constructor
+  if (deg >= 0) { p->copy_coeffs(input.get(), 0/*start*/, deg); }  // NOTE(review): assumes an inclusive end index and a negative degree for the zero polynomial - confirm
+  auto ntt_config = ntt::default_ntt_config<scalar_t>();
+  ntt_config.coset_gen = shift;
+  auto evals_h = std::make_unique<scalar_t[]>(n);
+  auto err = ntt::ntt(input.get(), n, ntt::NTTDir::kForward, ntt_config, evals_h.get());
+  if (err != cudaSuccess) { std::cerr << "InterpolateOnLargerDomain: NTT failed" << std::endl; return nullptr; }  // previously the status was dropped
+  return evals_h;
+}
+
+// Walks through the twelve "STARK by Hand" lessons with the ICICLE Polynomial API, printing each step; returns 0.
+int main(int argc, char** argv) {
+  std::cout << "This is an ICICLE C++ implementation of the STARK by Hand Explainer." << std::endl;
+  std::cout << "https://dev.risczero.com/proof-system/stark-by-hand" << std::endl;
+
+  const int logn=3;
+  const int n = 1 << logn;
+
+  std::cout << "Initializing NTT" << std::endl;
+  static const int MAX_NTT_LOG_SIZE = 24;
+  auto ntt_config = ntt::default_ntt_config<scalar_t>();
+  const scalar_t basic_root = scalar_t::omega(MAX_NTT_LOG_SIZE);
+  ntt::init_domain(basic_root, ntt_config.ctx);
+  std::cout << "Initializing Polynomials" << std::endl;
+  // Virtual factory design pattern: initializing polynomials factory for CUDA backend
+  Polynomial_t::initialize(std::make_unique<CUDAPolynomialFactory<>>());
+
+  std::cout << std::endl << "Lesson 1: The Execution Trace" << std::endl;
+  // Trace: Data Columns
+  rv_t rv_d1_trace[] = {24, 30, 54,  84, 78, 15, 29, 50};
+  rv_t rv_d2_trace[] = {30, 54, 84,  138, 2, 77, 21, 36};
+  rv_t rv_d3_trace[] = {54, 84, 138, 222, 71, 17, 92, 33};
+  auto d1_trace = std::make_unique<scalar_t[]>(n);
+  auto d2_trace = std::make_unique<scalar_t[]>(n);
+  auto d3_trace = std::make_unique<scalar_t[]>(n);
+  to_ff(rv_d1_trace, d1_trace.get(), n);
+  to_ff(rv_d2_trace, d2_trace.get(), n);
+  to_ff(rv_d3_trace, d3_trace.get(), n);
+  // Trace: Control Columns
+  // Init steps are flagged in c1_trace
+  // Computation steps are flagged in c2_trace
+  // Termination step is flagged in c3_trace
+  // 0s at the end of each control column correspond to the padding of the trace
+  rv_t rv_c1_trace[] = {1, 0, 0, 0, 0, 0, 0, 0};
+  rv_t rv_c2_trace[] = {0, 1, 1, 1, 0, 0, 0, 0};
+  rv_t rv_c3_trace[] = {0, 0, 0, 1, 0, 0, 0, 0};
+  auto c1_trace = std::make_unique<scalar_t[]>(n);
+  auto c2_trace = std::make_unique<scalar_t[]>(n);
+  auto c3_trace = std::make_unique<scalar_t[]>(n);
+  to_ff(rv_c1_trace, c1_trace.get(), n);
+  to_ff(rv_c2_trace, c2_trace.get(), n);
+  to_ff(rv_c3_trace, c3_trace.get(), n);
+
+  std::cout << "Lesson 2: Rule checks to validate a computation" << std::endl;
+  std::cout << "We use rule-checking polynomials." << std::endl;
+
+  std::cout << "Lesson 3: Padding the Trace" << std::endl;
+  // The trace is padded to a power of 2 size to allow for efficient NTT operations.
+  // we already did this in the initialization of the trace data
+  // We will construct a zero-knowledge proof that:
+  // this trace represents a program that satisfies these 6 rules:
+  //  1) d3_trace[i] == d1_trace[i] + d2_trace[i] in every row flagged by a control column (Fibonacci rule)
+  //  2) d1_trace[0] == 24   (init 1 constraint)
+  //  3) d2_trace[0] == 30   (init 2 constraint)
+  //  4) d3_trace[3] == 222  (termination constraint)
+  //  5) if c2_trace[i] == 1, then d1_trace[i] == d2_trace[i-1]
+  //  6) if c2_trace[i] == 1, then d2_trace[i] == d3_trace[i-1]
+
+  std::cout << "Lesson 4: Constructing Trace Polynomials" << std::endl;
+  auto p_d1 = Polynomial_t::from_rou_evaluations(d1_trace.get(), n);
+  auto p_d2 = Polynomial_t::from_rou_evaluations(d2_trace.get(), n);
+  auto p_d3 = Polynomial_t::from_rou_evaluations(d3_trace.get(), n);
+  auto p_c1 = Polynomial_t::from_rou_evaluations(c1_trace.get(), n);
+  auto p_c2 = Polynomial_t::from_rou_evaluations(c2_trace.get(), n);
+  auto p_c3 = Polynomial_t::from_rou_evaluations(c3_trace.get(), n);
+
+  std::cout << "Lesson 5: ZK Commitments of the Trace Data" << std::endl;
+  std::cout << "To maintain a zk protocol, the trace polynomials are evaluated over a zk commitment domain" << std::endl;
+  std::cout << "zk commitment domain is a coset of Reed Solomon domain shifted by a basic root of unity" << std::endl;
+  scalar_t xzk = basic_root;
+  p_print(&p_d1, logn, xzk, "ZK commitment for d1 polynomial");
+  std::cout << "Build Merkle Tree for ZK commitments (outside the scope of this example)" << std::endl;
+
+  std::cout << "Lesson 6: Constraint Polynomials" << std::endl;
+  std::cout << "The constraints are used to check the correctness of the trace. In this example, we check 6 rules to establish the validity of the trace." << std::endl;
+  auto p_fib_constraint =  (p_d3 - p_d2 - p_d1) * (p_c1 + p_c2 + p_c3);
+  auto fib_constraint_zkcommitment = InterpolateOnLargerDomain(&p_fib_constraint, 4*n, xzk);
+
+  auto p_init1_constraint = (p_d1 - p_value(scalar_t::from(24))) * p_c1;
+  // sanity checks printing
+  p_print(&p_init1_constraint, logn+2, scalar_t::one(), "Reed-Solomon constraint polynomial gives 0s in every 4th row");
+  p_print(&p_init1_constraint, logn+2, xzk, "ZK Commitment constraint polynomial gives no 0s");
+  auto p_init2_constraint = (p_d2 - p_value(scalar_t::from(30))) * p_c1;
+  auto p_termination_constraint = (p_d3 - p_value(scalar_t::from(222))) * p_c3;
+  auto p_recursion_constraint1 = (p_d1 - p_rotate(&p_d2, logn)) * p_c2;
+  auto p_recursion_constraint2 = (p_d2 - p_rotate(&p_d3, logn)) * p_c2;
+
+  std::cout << std::endl << "Lesson 7: Mixing Constraint Polynomials" << std::endl;
+  Polynomial_t * p_all_constraints[] = {&p_fib_constraint, &p_init1_constraint, &p_init2_constraint, &p_termination_constraint, &p_recursion_constraint1, &p_recursion_constraint2};
+  const size_t nmix = sizeof(p_all_constraints) / sizeof(p_all_constraints[0]);
+  auto p_mixed_constraints = p_mix(p_all_constraints, nmix, scalar_t::from(5));
+  std::cout << "All constraint polynomials are low-degree:" << std::endl;
+  for (size_t i = 0; i < nmix; ++i) {
+    std::cout << i << ": " << p_all_constraints[i]->degree() << std::endl;
+  }
+
+  std::cout << "Lesson 8: The Core of the RISC Zero STARK" << std::endl;
+  std::cout << "Degree of the mixed constraints polynomial: " << p_mixed_constraints.degree() << std::endl;
+  auto p_validity = p_mixed_constraints.divide_by_vanishing_polynomial(n);
+  std::cout << "Degree of the validity polynomial: " << p_validity.degree() << std::endl;
+  std::cout << "The Prover should provide the Merkle commitment for the above" << std::endl;
+
+  std::cout << "Lesson 9: The DEEP Technique" << std::endl;
+  std::cout << "The DEEP technique improves the security of a single query by sampling outside of the commitment domain."  << std::endl;
+  // In the original STARK protocol, the Verifier tests validity polynomial at a number of test points;
+  // the soundness of the protocol depends on the number of tests.
+  // The DEEP-ALI technique allows us to achieve a high degree of soundness with a single test.
+  // The details of DEEP are described in the following lesson.
+
+  auto DEEP_point = scalar_t::from(93);
+  std::cout << "The prover convinces the verifier that V=C/Z at the DEEP_test_point, " << DEEP_point << std::endl;
+  const scalar_t coeffs1[2] = {scalar_t::zero()-DEEP_point, scalar_t::one()};
+  auto denom_DEEP1 = Polynomial_t::from_coefficients(coeffs1, 2);
+  auto [p_d1_DEEP, r] = (p_d1 - p_value(p_d1(DEEP_point))).divide(denom_DEEP1);  // subtract the value d1(z), not the point z, so (x - z) divides exactly
+  std::cout << "The DEEP d1 degree is: " << p_d1_DEEP.degree() << std::endl;
+  // d2, d3 use recursion constraints and need the point corresponding to the previous state (clock cycle)
+  auto omega = scalar_t::omega(logn);
+  auto DEEP_prev_point = DEEP_point*scalar_t::inverse(omega);
+  auto coeffs2 = std::make_unique<scalar_t[]>(2);
+  coeffs2[0] = scalar_t::zero() - DEEP_prev_point;
+  coeffs2[1] = scalar_t::one();
+  auto denom_DEEP2 = Polynomial_t::from_coefficients(coeffs2.get(), 2);
+
+  auto coeffs_d2bar = std::make_unique<scalar_t[]>(2);
+  solve_linear(DEEP_point, p_d2(DEEP_point), DEEP_prev_point, p_d2(DEEP_prev_point), coeffs_d2bar.get());
+  auto d2bar = Polynomial_t::from_coefficients(coeffs_d2bar.get(), 2);
+  auto [p_d2_DEEP, r2] = (p_d2 - d2bar).divide(denom_DEEP1*denom_DEEP2);
+  std::cout << "The DEEP d2 degree is: " << p_d2_DEEP.degree() << std::endl;
+
+  auto coeffs_d3bar = std::make_unique<scalar_t[]>(2);
+  solve_linear(DEEP_point, p_d3(DEEP_point), DEEP_prev_point, p_d3(DEEP_prev_point), coeffs_d3bar.get());
+  auto d3bar = Polynomial_t::from_coefficients(coeffs_d3bar.get(), 2);
+  auto [p_d3_DEEP, r3] = (p_d3 - d3bar).divide(denom_DEEP1*denom_DEEP2);
+  std::cout << "The DEEP d3 degree is: " << p_d3_DEEP.degree() << std::endl;
+
+  // DEEP c{1,2,3} polynomials
+  const scalar_t coeffs_c1bar[1] = {p_c1(DEEP_point)};
+  auto c1bar = Polynomial_t::from_coefficients(coeffs_c1bar, 1);
+  auto [p_c1_DEEP, r_c1] = (p_c1 - c1bar).divide(denom_DEEP1);
+  std::cout << "The DEEP c1 degree is: " << p_c1_DEEP.degree() << std::endl;
+  const scalar_t coeffs_c2bar[1] = {p_c2(DEEP_point)};
+  auto c2bar = Polynomial_t::from_coefficients(coeffs_c2bar, 1);
+  auto [p_c2_DEEP, r_c2] = (p_c2 - c2bar).divide(denom_DEEP1);
+  std::cout << "The DEEP c2 degree is: " << p_c2_DEEP.degree() << std::endl;
+  const scalar_t coeffs_c3bar[1] = {p_c3(DEEP_point)};
+  auto c3bar = Polynomial_t::from_coefficients(coeffs_c3bar, 1);
+  auto [p_c3_DEEP, r_c3] = (p_c3 - c3bar).divide(denom_DEEP1);
+  std::cout << "The DEEP c3 degree is: " << p_c3_DEEP.degree() << std::endl;
+  // DEEP validity polynomial
+  const scalar_t coeffs_vbar[1] = {p_validity(DEEP_point)};
+  auto vbar = Polynomial_t::from_coefficients(coeffs_vbar, 1);
+  auto [v_DEEP, r_v] = (p_validity - vbar).divide(denom_DEEP1);
+  std::cout << "The DEEP validity polynomial degree is: " << v_DEEP.degree() << std::endl;
+  std::cout << "The Prover sends DEEP polynomials to the Verifier" << std::endl;
+
+  std::cout << "Lesson 10: Mixing (Batching) for FRI" << std::endl;
+  std::cout << "The initial FRI polynomial is the mix of the 7 DEEP polynomials." << std::endl;
+  Polynomial_t* all_DEEP[] = {&p_d1_DEEP, &p_d2_DEEP, &p_d3_DEEP, &p_c1_DEEP, &p_c2_DEEP, &p_c3_DEEP, &v_DEEP};
+  Polynomial_t fri_input = p_mix(all_DEEP, sizeof(all_DEEP) / sizeof(all_DEEP[0]), scalar_t::from(99));
+  std::cout << "The degree of the mixed DEEP polynomial is: " << fri_input.degree() << std::endl;
+
+  std::cout << "Lesson 11: FRI Protocol (Commit Phase)" << std::endl;
+  std::cout << "The prover provides information to convince the verifier that the DEEP polynomials are low-degree." << std::endl;
+  constexpr int nof_rounds = 3;  // compile-time constant, so the arrays below are not variable-length arrays
+  Polynomial_t feven[nof_rounds], fodd[nof_rounds], fri[nof_rounds+1];
+  scalar_t rfri[nof_rounds];
+  fri[0] = fri_input.clone();
+  for (int i = 0; i < nof_rounds; ++i) {
+    feven[i] = fri[i].even();
+    fodd[i] = fri[i].odd();
+    rfri[i] = scalar_t::rand_host();
+    fri[i+1] = feven[i] + rfri[i]*fodd[i];
+    std::cout << "The degree of the Round " << i << " polynomial is: " << fri[i+1].degree() << std::endl;
+  }
+
+  std::cout << "Lesson 12: FRI Protocol (Query Phase)" << std::endl;
+  // We use Polynomial API to evaluate the FRI polynomials
+  // In practice, verifier will use Merkle commitments
+  auto xp = scalar_t::rand_host();
+  auto xm = scalar_t::zero() - xp;
+  scalar_t lhs[nof_rounds], rhs[nof_rounds];
+  for (int i = 0; i < nof_rounds; ++i) {
+    rhs[i] = (rfri[i]+xp)*fri[i](xp)*scalar_t::inverse(scalar_t::from(2)*xp) + (rfri[i]+xm)*fri[i](xm)*scalar_t::inverse(scalar_t::from(2)*xm);
+    lhs[i] = fri[i+1](xp*xp);
+    std::cout << "Round " << i << std::endl << "rhs: " << rhs[i] << std::endl << "lhs: " << lhs[i] << std::endl;
+  }
+
+  return 0;
+}
\ No newline at end of file
diff --git a/examples/c++/risc0/run.sh b/examples/c++/risc0/run.sh
new file mode 100755
index 00000000..01eca66b
--- /dev/null
+++ b/examples/c++/risc0/run.sh
@@ -0,0 +1,2 @@
+#!/bin/bash
+"$(dirname "${BASH_SOURCE[0]}")/build/example/example"  # binary built by compile.sh, resolved relative to this script rather than the caller's cwd