diff --git a/docs/docs/icicle/polynomials/overview.md b/docs/docs/icicle/polynomials/overview.md
index 694fd12b..a57a14b7 100644
--- a/docs/docs/icicle/polynomials/overview.md
+++ b/docs/docs/icicle/polynomials/overview.md
@@ -128,12 +128,13 @@ auto H = (A*B-C).divide_by_vanishing_polynomial(N);
 
 ### Evaluation
 
-Evaluate polynomials at arbitrary domain points or across a domain.
+Evaluate polynomials at arbitrary domain points, across a domain or on a roots-of-unity domain.
 
 ```cpp
 Image operator()(const Domain& x) const; // evaluate f(x)
 void evaluate(const Domain* x, Image* evals /*OUT*/) const;
 void evaluate_on_domain(Domain* domain, uint64_t size, Image* evals /*OUT*/) const; // caller allocates memory
+void evaluate_on_rou_domain(uint64_t domain_log_size, Image* evals /*OUT*/) const;  // caller allocate memory
 ```
 
 Example:
@@ -147,18 +148,13 @@ uint64_t domain_size = ...;
 auto domain = /*build domain*/; // host or device memory
 auto evaluations = std::make_unique<scalar_t[]>(domain_size); // can be device memory too
 f.evaluate_on_domain(domain, domain_size, evaluations);
+
+// evaluate f(x) on roots of unity domain
+uint64_t domain_log_size = ...;
+auto evaluations_rou_domain = std::make_unique<scalar_t[]>(1 << domain_log_size); // can be device memory too
+f.evaluate_on_rou_domain(domain_log_size, evaluations_rou_domain);
 ```
 
-:::note
-For special domains such as roots of unity, this method is not the most efficient for two reasons:
-
-- Need to build the domain of size N.
-- The implementation is not trying to identify this special domain.
-
-Therefore the computation is typically $O(n^2)$ rather than $O(nlogn)$.
-See the 'device views' section for more details.
-:::
-
 ### Manipulations
 
 Beyond arithmetic, the API supports efficient polynomial manipulations:
@@ -255,7 +251,7 @@ auto rv = msm::MSM(coeffs_device, points, msm_size, cfg, results);
 
 #### Views
 
-The Polynomial API supports efficient data handling through the use of memory views. These views provide direct access to the polynomial's internal state, such as coefficients or evaluations without the need to copy data. This feature is particularly useful for operations that require direct access to device memory, enhancing both performance and memory efficiency.
+The Polynomial API supports efficient data handling through the use of memory views. These views provide direct access to the polynomial's internal state without the need to copy data. This feature is particularly useful for operations that require direct access to device memory, enhancing both performance and memory efficiency.
 
 ##### What is a Memory View?
 
@@ -265,7 +261,7 @@ A memory view is essentially a pointer to data stored in device memory. By provi
 
 Memory views are extremely versatile and can be employed in various computational contexts such as:
 
-- **Commitments**: Views can be used to commit polynomial states in cryptographic schemes, such as Multi-Scalar Multiplications (MSM), or for constructing Merkle trees without duplicating the underlying data.
+- **Commitments**: Views can be used to commit polynomial states in cryptographic schemes, such as Multi-Scalar Multiplications (MSM).
 - **External Computations**: They allow external functions or algorithms to utilize the polynomial's data directly, facilitating operations outside the core polynomial API. This is useful for custom operations that are not covered by the API.
 
 ##### Obtaining and Using Views
@@ -275,9 +271,6 @@ To create and use views within the Polynomial API, functions are provided to obt
 ```cpp
 // Obtain a view of the polynomial's coefficients
 std::tuple<IntegrityPointer<Coeff>, uint64_t /*size*/, uint64_t /*device_id*/> get_coefficients_view();
-// obtain a view of the evaluations. Can specify the domain size and whether to compute reversed evaluations.
-std::tuple<IntegrityPointer<Image>, uint64_t /*size*/, uint64_t /*device_id*/>
-get_rou_evaluations_view(uint64_t nof_evaluations = 0, bool is_reversed = false);
 ```
 
 Example usage:
@@ -328,22 +321,7 @@ if (coeff_view.isValid()) {
 }
 ```
 
-#### Evaluations View: Accessing Polynomial Evaluations Efficiently
 
-The Polynomial API offers a specialized method, `get_rou_evaluations_view(...)`, which facilitates direct access to the evaluations of a polynomial. This method is particularly useful for scenarios where polynomial evaluations need to be accessed frequently or manipulated externally without the overhead of copying data.
-This method provides a memory view into the device memory where polynomial evaluations are stored. It allows for efficient interpolation on larger domains, leveraging the raw evaluations directly from memory.
-
-:::warning
-Invalid request: requesting evaluations on a domain smaller than the degree of the polynomial is not supported and is considered invalid.
-:::
-
-```cpp
-// Assume a polynomial `p` of degree N
-auto [evals_view, size, device_id] = p.get_rou_evaluations_view(4*N); // expanding the evaluation domain
-
-// Use the evaluations view to perform further computations or visualizations
-process_polynomial_evaluations(evals_view.get(), size, device_id);
-```
 
 ## Multi-GPU Support with CUDA Backend
 
diff --git a/docs/docs/icicle/rust-bindings/polynomials.md b/docs/docs/icicle/rust-bindings/polynomials.md
index bcb8a875..c168aff4 100644
--- a/docs/docs/icicle/rust-bindings/polynomials.md
+++ b/docs/docs/icicle/rust-bindings/polynomials.md
@@ -67,6 +67,9 @@ where
         evals: &mut E,
     );
 
+    // Method to evaluate the polynomial over the roots-of-unity domain for power-of-two sized domain
+    fn eval_on_rou_domain<E: HostOrDeviceSlice<Self::Field> + ?Sized>(&self, domain_log_size: u64, evals: &mut E);
+
     // Method to retrieve a coefficient at a specific index.
     fn get_coeff(&self, idx: u64) -> Self::Field;
 
@@ -228,6 +231,11 @@ let f_x = f.eval(&x);  // Evaluate f at x
 let domain = [one, two, three];
 let mut host_evals = vec![ScalarField::zero(); domain.len()];
 f.eval_on_domain(HostSlice::from_slice(&domain), HostSlice::from_mut_slice(&mut host_evals));
+
+// Evaluate on roots-of-unity-domain
+let domain_log_size = 4;
+let mut device_evals = DeviceVec::<ScalarField>::cuda_malloc(1 << domain_log_size).unwrap();
+f.eval_on_rou_domain(domain_log_size, &mut device_evals[..]);
 ```
 
 ### Read coefficients
diff --git a/icicle/include/fields/field.cuh b/icicle/include/fields/field.cuh
index 2c2bdd0a..52a39128 100644
--- a/icicle/include/fields/field.cuh
+++ b/icicle/include/fields/field.cuh
@@ -1001,6 +1001,17 @@ public:
     }
     return (u == one) ? b : c;
   }
+
+  static constexpr HOST_DEVICE_INLINE Field pow(Field base, int exp)
+  {
+    Field res = one();
+    while (exp > 0) {
+      if (exp & 1) res = res * base;
+      base = base * base;
+      exp >>= 1;
+    }
+    return res;
+  }
 };
 
 template <class CONFIG>
diff --git a/icicle/include/polynomials/polynomial_backend.h b/icicle/include/polynomials/polynomial_backend.h
index 786b4f60..45d27b49 100644
--- a/icicle/include/polynomials/polynomial_backend.h
+++ b/icicle/include/polynomials/polynomial_backend.h
@@ -56,6 +56,7 @@ namespace polynomials {
     // Evaluation methods
     virtual void evaluate(PolyContext op, const D* domain_x, I* eval /*OUT*/) = 0;
     virtual void evaluate_on_domain(PolyContext op, const D* domain, uint64_t size, I* evaluations /*OUT*/) = 0;
+    virtual void evaluate_on_rou_domain(PolyContext op, uint64_t domain_log_size, I* evals /*OUT*/) = 0;
 
     // Methods to copy coefficients to host memory
     virtual C get_coeff(PolyContext op, uint64_t coeff_idx) = 0;
@@ -64,8 +65,6 @@ namespace polynomials {
     // Methods to get views of coefficients and evaluations, including device id
     virtual std::tuple<IntegrityPointer<C>, uint64_t /*size*/, uint64_t /*device_id*/>
     get_coefficients_view(PolyContext p) = 0;
-    virtual std::tuple<IntegrityPointer<I>, uint64_t /*size*/, uint64_t /*device_id*/>
-    get_rou_evaluations_view(PolyContext p, uint64_t nof_evaluations = 0, bool is_reversed = false) = 0;
   };
 
 } // namespace polynomials
diff --git a/icicle/include/polynomials/polynomial_context.h b/icicle/include/polynomials/polynomial_context.h
index bd8e2dba..2cd95bb4 100644
--- a/icicle/include/polynomials/polynomial_context.h
+++ b/icicle/include/polynomials/polynomial_context.h
@@ -71,10 +71,8 @@ namespace polynomials {
     virtual std::pair<const C*, uint64_t> get_coefficients() = 0;
     virtual std::pair<const I*, uint64_t> get_rou_evaluations() = 0;
 
-    // Methods to get views of coefficients and evaluations, including device id.
+    // Methods to get views of coefficients
     virtual std::tuple<IntegrityPointer<C>, uint64_t /*size*/, uint64_t /*device_id*/> get_coefficients_view() = 0;
-    virtual std::tuple<IntegrityPointer<I>, uint64_t /*size*/, uint64_t /*device_id*/>
-    get_rou_evaluations_view(uint64_t nof_evaluations = 0, bool is_reversed = false) = 0;
 
     // Method for printing the context state to an output stream.
     virtual void print(std::ostream& os) = 0;
diff --git a/icicle/include/polynomials/polynomials.h b/icicle/include/polynomials/polynomials.h
index 3a436eea..7916b837 100644
--- a/icicle/include/polynomials/polynomials.h
+++ b/icicle/include/polynomials/polynomials.h
@@ -68,6 +68,7 @@ namespace polynomials {
     Image operator()(const Domain& x) const;
     void evaluate(const Domain* x, Image* eval /*OUT*/) const;
     void evaluate_on_domain(Domain* domain, uint64_t size, Image* evals /*OUT*/) const; // caller allocates memory
+    void evaluate_on_rou_domain(uint64_t domain_log_size, Image* evals /*OUT*/) const;  // caller allocate memory
 
     // Method to obtain the degree of the polynomial
     int64_t degree();
@@ -77,10 +78,8 @@ namespace polynomials {
     // caller is allocating output memory. If coeff==nullptr, returning nof_coeff only
     uint64_t copy_coeffs(Coeff* host_coeffs, uint64_t start_idx, uint64_t end_idx) const;
 
-    // Methods for obtaining a view of the coefficients or evaluations
+    // Methods for obtaining a view of the coefficients
     std::tuple<IntegrityPointer<Coeff>, uint64_t /*size*/, uint64_t /*device_id*/> get_coefficients_view();
-    std::tuple<IntegrityPointer<Image>, uint64_t /*size*/, uint64_t /*device_id*/>
-    get_rou_evaluations_view(uint64_t nof_evaluations = 0, bool is_reversed = false);
 
     // Overload stream insertion operator for printing.
     friend std::ostream& operator<<(std::ostream& os, Polynomial& poly)
diff --git a/icicle/src/polynomials/cuda_backend/kernels.cuh b/icicle/src/polynomials/cuda_backend/kernels.cuh
index 61e7d881..2d1032b6 100644
--- a/icicle/src/polynomials/cuda_backend/kernels.cuh
+++ b/icicle/src/polynomials/cuda_backend/kernels.cuh
@@ -38,18 +38,6 @@ namespace polynomials {
   }
 
   /*============================== evaluate ==============================*/
-  template <typename T>
-  __device__ T pow(T base, int exp)
-  {
-    T result = T::one();
-    while (exp > 0) {
-      if (exp & 1) result = result * base;
-      base = base * base;
-      exp >>= 1;
-    }
-    return result;
-  }
-
   // TODO Yuval: implement efficient reduction and support batch evaluation
   template <typename T>
   __global__ void dummy_reduce(const T* arr, int size, T* output)
@@ -67,7 +55,7 @@ namespace polynomials {
   __global__ void evaluate_polynomial_without_reduction(const T* x, const T* coeffs, int num_coeffs, T* tmp)
   {
     const int tid = blockIdx.x * blockDim.x + threadIdx.x;
-    if (tid < num_coeffs) { tmp[tid] = coeffs[tid] * pow(*x, tid); }
+    if (tid < num_coeffs) { tmp[tid] = coeffs[tid] * T::pow(*x, tid); }
   }
 
   /*============================== division ==============================*/
diff --git a/icicle/src/polynomials/cuda_backend/polynomial_cuda_backend.cu b/icicle/src/polynomials/cuda_backend/polynomial_cuda_backend.cu
index a4cd769b..1b9d5d14 100644
--- a/icicle/src/polynomials/cuda_backend/polynomial_cuda_backend.cu
+++ b/icicle/src/polynomials/cuda_backend/polynomial_cuda_backend.cu
@@ -201,20 +201,6 @@ namespace polynomials {
       return {std::move(integrity_pointer), N, m_device_context.device_id};
     }
 
-    std::tuple<IntegrityPointer<I>, uint64_t, uint64_t>
-    get_rou_evaluations_view(uint64_t nof_evaluations, bool is_reversed)
-    {
-      if (nof_evaluations != 0 && nof_evaluations < get_nof_elements()) {
-        THROW_ICICLE_ERR(IcicleError_t::InvalidArgument, "get_rou_evaluations_view() can only expand #evals");
-      }
-      transform_to_evaluations(nof_evaluations, is_reversed);
-      auto [evals, N] = get_rou_evaluations();
-      // when reading the pointer, if the counter was modified, the pointer is invalid
-      IntegrityPointer<I> integrity_pointer(evals, m_integrity_counter, *m_integrity_counter);
-      CHK_STICKY(cudaStreamSynchronize(m_device_context.stream));
-      return {std::move(integrity_pointer), N, m_device_context.device_id};
-    }
-
     std::pair<const I*, uint64_t> get_rou_evaluations() override
     {
       const bool is_reversed = this->m_state == State::EvaluationsOnRou_Reversed;
@@ -598,10 +584,9 @@ namespace polynomials {
 
       const int64_t deg_a = degree(a);
       const int64_t deg_b = degree(b);
-      if (deg_a < deg_b || deg_b < 0) {
+      if (deg_b < 0) {
         THROW_ICICLE_ERR(
-          IcicleError_t::InvalidArgument, "Polynomial division (CUDA backend): numerator degree must be "
-                                          "greater-or-equal to denumerator degree and denumerator must not be zero");
+          IcicleError_t::InvalidArgument, "Polynomial division (CUDA backend): divide by zeropolynomial ");
       }
 
       // init: Q=0, R=a
@@ -649,10 +634,45 @@ namespace polynomials {
     {
       assert_device_compatability(numerator, out);
 
-      // TODO Yuval: vanishing polynomial x^n-1 evaluates to zero on ROU
-      // Therefore constant on coset with u as coset generator ((wu)^n-1 = w^n*u^n-1 = u^n-1)
-      // This is true for a coset of size n but if numerator is of size >n, then I need a larger coset and it
-      // doesn't hold. Need to use this fact to optimize division
+      // vanishing polynomial of degree N is the polynomial V(x) such that V(r)=0 for r Nth root-of-unity.
+      // For example for N=4 it vanishes on the group [1,W,W^2,W^3] where W is the 4th root of unity. In that
+      // case V(x)=(x-1)(x-w)(x-w^2)(x-w^3). It can be easily shown that V(x)=x^N-1. This holds since x^N=1 on this
+      // domain (since x is the Nth root of unity).
+
+      // Note that we always represent polynomials with N elements for N a power of two. This is required for NTTs.
+      // In addition we consider deg(P) to be this number of elements N even though the real degree may be lower. for
+      // example 1+x-2x^2 is degree 2 but we store 4 elements and consider it degree 3.
+
+      // when dividing a polynomial  P(x)/V(x) (The vanishing polynomial) the output is of degree deg(P)-deg(V). There
+      // are three cases where V(x) divides P(x) (this is assumed since otherwise the output polynomial does not
+      // exist!):
+      // (1) deg(P)=2*deg(V): in that case deg(P/V)=deg(V)=N. This is an efficient case since on a domain of size N, the
+      // vanishing polynomial evaluates to a constant value.
+      // (2) deg(P)=deg(V)=N: in that case the output is a degree 0 polynomial.
+      // polynomial (i.e. scalar). (3) general case: deg(P)>2*deg(V): in that case deg(P) is a least 4*deg(V) since N is
+      // a power of two. This means that deg(P/V)=deg(P). For example deg(P)=16, deg(V)=4 --> deg(P/V)=12 ceiled to 16.
+
+      // When computing we want to divide P(x)'s evals by V(x)'s evals. Since V(x)=0 on this domain we have to compute
+      // on a coset.
+      // for case (3) we must evaluate V(x) on deg(P) domain size and compute elementwise division on a coset.
+      // case (1) is more efficient because we need N evaluations of V(x) on a coset. Note that V(x)=constant on a coset
+      // of rou. This is because V(wu)=(wu)^N-1=W^N*u^N-1 = 1*u^N-1 (as w^N=1 for w Nth root of unity). case (2) can be
+      // computed like case (1).
+
+      const bool is_case_2N = numerator->get_nof_elements() == 2 * vanishing_poly_degree;
+      const bool is_case_N = numerator->get_nof_elements() == vanishing_poly_degree;
+      if (is_case_2N) {
+        divide_by_vanishing_case_2N(out, numerator, vanishing_poly_degree);
+      } else if (is_case_N) {
+        divide_by_vanishing_case_N(out, numerator, vanishing_poly_degree);
+      } else {
+        divide_by_vanishing_general_case(out, numerator, vanishing_poly_degree);
+      }
+    }
+
+    void divide_by_vanishing_general_case(PolyContext out, PolyContext numerator, uint64_t vanishing_poly_degree)
+    {
+      // General case: P(x)/V(x) where v is of degree N and p of any degree>N
 
       // (1) allocate vanishing polynomial in coefficients form
       // TODO Yuval: maybe instead of taking numerator memory and modiyfing it diretcly add a state for evaluations
@@ -688,12 +708,89 @@ namespace polynomials {
       div_element_wise_kernel<<<NOF_BLOCKS, NOF_THREADS, 0, m_device_context.stream>>>(
         numerator_coeffs, out_coeffs, N, out_coeffs);
 
-      // (4) INTT back both a and out
+      // (4) INTT back both numerator and out
       ntt_config.ordering = ntt::Ordering::kMN;
       CHK_STICKY(ntt::ntt(out_coeffs, N, ntt::NTTDir::kInverse, ntt_config, out_coeffs));
       CHK_STICKY(ntt::ntt(numerator_coeffs, N, ntt::NTTDir::kInverse, ntt_config, numerator_coeffs));
     }
 
+    void divide_by_vanishing_case_2N(PolyContext out, PolyContext numerator, uint64_t vanishing_poly_degree)
+    {
+      // in that special case the numertaor has 2N elements and output will be N elements
+      if (numerator->get_nof_elements() != 2 * vanishing_poly_degree) {
+        THROW_ICICLE_ERR(IcicleError_t::UndefinedError, "invalid input size. Expecting numerator to be of size 2N");
+      }
+
+      // In the case where deg(P)=2N, I can transform numerator to Reversed-evals -> The second half is
+      // a reversed-coset of size N with coset-gen the 2N-th root of unity.
+      const int N = vanishing_poly_degree;
+      numerator->transform_to_evaluations(2 * N, true /*=reversed*/);
+      // allocate output in coeffs because it will be calculated on a coset but I don't have such a state so will have
+      // to INTT back to coeffs
+      auto numerator_evals_reversed_p = get_context_storage_immutable<I>(numerator);
+      out->allocate(N, State::Coefficients, false /*=set zeros*/);
+      auto out_evals_reversed_p = get_context_storage_mutable<I>(out);
+
+      auto ntt_config = ntt::default_ntt_config<C>(m_device_context);
+      ntt_config.coset_gen = ntt::get_root_of_unity_from_domain<D>((uint64_t)log2(2 * N), ntt_config.ctx);
+      // compute inv(u^N-1);
+      D v_coset_eval = D::inverse(D::pow(ntt_config.coset_gen, N) - D::one());
+
+      const int NOF_THREADS = 128;
+      const int NOF_BLOCKS = (N + NOF_THREADS - 1) / NOF_THREADS;
+      mul_scalar_kernel<<<NOF_BLOCKS, NOF_THREADS, 0, m_device_context.stream>>>(
+        numerator_evals_reversed_p + N /*second half is the reversed coset*/, v_coset_eval, N, out_evals_reversed_p);
+
+      // INTT back from reversed evals on coset to coeffs
+      ntt_config.are_inputs_on_device = true;
+      ntt_config.are_outputs_on_device = true;
+      ntt_config.is_async = true;
+      ntt_config.ordering = ntt::Ordering::kRN;
+      ntt::ntt(out_evals_reversed_p, N, ntt::NTTDir::kInverse, ntt_config, out_evals_reversed_p);
+
+      CHK_LAST();
+    }
+
+    void divide_by_vanishing_case_N(PolyContext out, PolyContext numerator, uint64_t vanishing_poly_degree)
+    {
+      // in that special case the numertaor has N elements and output will be N elements
+      if (numerator->get_nof_elements() != vanishing_poly_degree) {
+        THROW_ICICLE_ERR(IcicleError_t::UndefinedError, "invalid input size. Expecting numerator to be of size N");
+      }
+
+      const int N = vanishing_poly_degree;
+      numerator->transform_to_coefficients(N);
+      auto numerator_evals_reversed_p = get_context_storage_immutable<I>(numerator);
+      out->allocate(N, State::Coefficients, false /*=set zeros*/);
+      auto out_evals_reversed_p = get_context_storage_mutable<I>(out);
+
+      // (1) NTT numerator to coset evals (directly to out)
+      auto ntt_config = ntt::default_ntt_config<C>(m_device_context);
+      ntt_config.coset_gen = ntt::get_root_of_unity_from_domain<D>((uint64_t)log2(2 * N), ntt_config.ctx);
+      ntt_config.are_inputs_on_device = true;
+      ntt_config.are_outputs_on_device = true;
+      ntt_config.is_async = true;
+      ntt_config.ordering = ntt::Ordering::kNM;
+      ntt::ntt(numerator_evals_reversed_p, N, ntt::NTTDir::kForward, ntt_config, out_evals_reversed_p);
+
+      // (2) divide by constant value (that V(x) evaluates to on the coset)
+      D v_coset_eval = D::inverse(D::pow(ntt_config.coset_gen, N) - D::one());
+
+      const int NOF_THREADS = 128;
+      const int NOF_BLOCKS = (N + NOF_THREADS - 1) / NOF_THREADS;
+      mul_scalar_kernel<<<NOF_BLOCKS, NOF_THREADS, 0, m_device_context.stream>>>(
+        out_evals_reversed_p, v_coset_eval, N, out_evals_reversed_p);
+
+      // (3) INTT back from coset to coeffs
+      ntt_config.are_inputs_on_device = true;
+      ntt_config.are_outputs_on_device = true;
+      ntt_config.is_async = true;
+      ntt_config.ordering = ntt::Ordering::kMN;
+      ntt::ntt(out_evals_reversed_p, N, ntt::NTTDir::kInverse, ntt_config, out_evals_reversed_p);
+
+      CHK_LAST();
+    }
+
     // arithmetic with monomials
     void add_monomial_inplace(PolyContext& poly, C monomial_coeff, uint64_t monomial) override
     {
@@ -776,6 +873,72 @@ namespace polynomials {
       }
     }
 
+    void evaluate_on_rou_domain(PolyContext p, uint64_t domain_log_size, I* evals /*OUT*/) override
+    {
+      const uint64_t poly_size = p->get_nof_elements();
+      const uint64_t domain_size = 1 << domain_log_size;
+      const bool is_evals_on_host = is_host_ptr(evals, m_device_context.device_id);
+
+      I* d_evals = evals;
+      // if evals on host, allocate CUDA memory
+      if (is_evals_on_host) { CHK_STICKY(cudaMallocAsync(&d_evals, domain_size * sizeof(I), m_device_context.stream)); }
+
+      // If domain size is smaller the polynomial size -> transform to evals and copy the evals with stride.
+      // Else, if in coeffs copy coeffs to evals mem and NTT inplace to compute the evals, else INTT to d_evals and back
+      // inplace to larger domain
+      const bool is_domain_size_smaller_than_poly_size = domain_size <= poly_size;
+      if (is_domain_size_smaller_than_poly_size) {
+        // TODO Yuval: in reversed evals, can reverse the first 'domain_size' elements to d_evals instead of
+        // transforming back to evals.
+        p->transform_to_evaluations();
+        const auto stride = poly_size / domain_size;
+        const int NOF_THREADS = 128;
+        const int NOF_BLOCKS = (domain_size + NOF_THREADS - 1) / NOF_THREADS;
+        slice_kernel<<<NOF_BLOCKS, NOF_THREADS, 0, m_device_context.stream>>>(
+          get_context_storage_immutable<I>(p), d_evals, 0 /*offset*/, stride, domain_size);
+      } else {
+        CHK_STICKY(cudaMemset(d_evals, 0, domain_size * sizeof(I)));
+        auto ntt_config = ntt::default_ntt_config<D>(m_device_context);
+        ntt_config.are_inputs_on_device = true;
+        ntt_config.are_outputs_on_device = true;
+        ntt_config.is_async = true;
+        // TODO Yuval: in evals I can NTT directly to d_evals without changing my state
+        switch (p->get_state()) {
+        case State::Coefficients: {
+          // copy to evals memory and inplace NTT of domain size
+          CHK_STICKY(
+            cudaMemcpy(d_evals, get_context_storage_immutable<I>(p), poly_size * sizeof(I), cudaMemcpyDeviceToDevice));
+          ntt_config.ordering = ntt::Ordering::kNN;
+          ntt::ntt(d_evals, domain_size, ntt::NTTDir::kForward, ntt_config, d_evals);
+        } break;
+        case State::EvaluationsOnRou_Natural:
+        case State::EvaluationsOnRou_Reversed: {
+          const bool is_from_natrual = p->get_state() == State::EvaluationsOnRou_Natural;
+          // INTT to coeffs and back to evals
+          ntt_config.ordering = is_from_natrual ? ntt::Ordering::kNM : ntt::Ordering::kRN;
+          ntt::ntt(get_context_storage_immutable<I>(p), poly_size, ntt::NTTDir::kInverse, ntt_config, d_evals);
+          ntt_config.ordering = is_from_natrual ? ntt::Ordering::kMN : ntt::Ordering::kNN;
+          ntt::ntt(d_evals, poly_size, ntt::NTTDir::kForward, ntt_config, d_evals);
+        } break;
+        default:
+          THROW_ICICLE_ERR(IcicleError_t::UndefinedError, "Invalid state to compute evaluations");
+          break;
+        }
+      }
+
+      // release CUDA memory if allocated
+      if (is_evals_on_host) {
+        CHK_STICKY(
+          cudaMemcpyAsync(evals, d_evals, domain_size * sizeof(I), cudaMemcpyDeviceToHost, m_device_context.stream));
+        CHK_STICKY(cudaFreeAsync(d_evals, m_device_context.stream));
+      }
+
+      // sync since user cannot reuse this stream so need to make sure evals are computed
+      CHK_STICKY(cudaStreamSynchronize(m_device_context.stream)); // sync to make sure return value is copied to host
+
+      CHK_LAST();
+    }
+
     uint64_t copy_coeffs(PolyContext op, C* out_coeffs, uint64_t start_idx, uint64_t end_idx) override
     {
       const uint64_t nof_coeffs = op->get_nof_elements();
@@ -809,18 +972,16 @@ namespace polynomials {
       return host_coeff;
     }
 
-    std::tuple<IntegrityPointer<C>, uint64_t /*size*/, uint64_t /*device_id*/>
+    std::tuple<
+      IntegrityPointer<C>,
+      uint64_t /*size*/
+      ,
+      uint64_t /*device_id*/>
     get_coefficients_view(PolyContext p) override
     {
       return p->get_coefficients_view();
     }
 
-    std::tuple<IntegrityPointer<I>, uint64_t /*size*/, uint64_t /*device_id*/>
-    get_rou_evaluations_view(PolyContext p, uint64_t nof_evaluations, bool is_reversed) override
-    {
-      return p->get_rou_evaluations_view(nof_evaluations, is_reversed);
-    }
-
     inline void assert_device_compatability(PolyContext a, PolyContext b) const
     {
       CUDAPolynomialContext<C, D, I>* a_cuda = static_cast<CUDAPolynomialContext<C, D, I>*>(a.get());
diff --git a/icicle/src/polynomials/polynomials.cu b/icicle/src/polynomials/polynomials.cu
index b204bb00..6e250d36 100644
--- a/icicle/src/polynomials/polynomials.cu
+++ b/icicle/src/polynomials/polynomials.cu
@@ -165,6 +165,12 @@ namespace polynomials {
     return m_backend->evaluate_on_domain(m_context, domain, size, evals);
   }
 
+  template <typename C, typename D, typename I>
+  void Polynomial<C, D, I>::evaluate_on_rou_domain(uint64_t domain_log_size, I* evals /*OUT*/) const
+  {
+    return m_backend->evaluate_on_rou_domain(m_context, domain_log_size, evals);
+  }
+
   template <typename C, typename D, typename I>
   int64_t Polynomial<C, D, I>::degree()
   {
@@ -190,13 +196,6 @@ namespace polynomials {
     return m_backend->get_coefficients_view(m_context);
   }
 
-  template <typename C, typename D, typename I>
-  std::tuple<IntegrityPointer<I>, uint64_t /*size*/, uint64_t /*device_id*/>
-  Polynomial<C, D, I>::get_rou_evaluations_view(uint64_t nof_evaluations, bool is_reversed)
-  {
-    return m_backend->get_rou_evaluations_view(m_context, nof_evaluations, is_reversed);
-  }
-
   // explicit instantiation for default type (scalar field)
   template class Polynomial<scalar_t>;
   template Polynomial<scalar_t> operator*(const scalar_t& c, const Polynomial<scalar_t>& rhs);
diff --git a/icicle/src/polynomials/polynomials_c_api.cu b/icicle/src/polynomials/polynomials_c_api.cu
index 6bf16439..8672fbd9 100644
--- a/icicle/src/polynomials/polynomials_c_api.cu
+++ b/icicle/src/polynomials/polynomials_c_api.cu
@@ -200,6 +200,16 @@ namespace polynomials {
     return p->evaluate_on_domain(domain, domain_size, evals);
   }
 
+  // Evaluates a polynomial on a ROU domain.
+  // p: Pointer to the polynomial instance.
+  // domain_log_size: log size of the domain to evaluate
+  // evals: Output array for the evaluations.
+  void CONCAT_EXPAND(FIELD, polynomial_evaluate_on_rou_domain)(
+    const PolynomialInst* p, uint64_t domain_log_size, scalar_t* evals /*OUT*/)
+  {
+    return p->evaluate_on_rou_domain(domain_log_size, evals);
+  }
+
   // Returns the degree of a polynomial.
   // p: Pointer to the polynomial instance.
   // Returns the degree of the polynomial.
@@ -245,22 +255,6 @@ namespace polynomials {
     return new IntegrityPointer<scalar_t>(std::move(coeffs));
   }
 
-  // Retrieves a device-memory view of the polynomial's evaluations on the roots of unity.
-  // p: Pointer to the polynomial instance.
-  // nof_evals: Number of evaluations.
-  // is_reversed: Whether the evaluations are in reversed order.
-  // size: Output parameter for the size of the view.
-  // device_id: Output parameter for the device ID.
-  // Returns a pointer to an integrity pointer encapsulating the evaluations view.
-  IntegrityPointer<scalar_t>* CONCAT_EXPAND(FIELD, polynomial_get_rou_evaluations_view)(
-    PolynomialInst* p, uint64_t nof_evals, bool is_reversed, uint64_t* size /*OUT*/, uint64_t* device_id /*OUT*/)
-  {
-    auto [rou_evals, _size, _device_id] = p->get_rou_evaluations_view(nof_evals, is_reversed);
-    *size = _size;
-    *device_id = _device_id;
-    return new IntegrityPointer<scalar_t>(std::move(rou_evals));
-  }
-
   // Reads the pointer from an integrity pointer.
   // p: Pointer to the integrity pointer.
   // Returns the raw pointer if still valid, otherwise NULL.
diff --git a/icicle/tests/polynomial_test.cu b/icicle/tests/polynomial_test.cu
index d8bcc130..219cc1c9 100644
--- a/icicle/tests/polynomial_test.cu
+++ b/icicle/tests/polynomial_test.cu
@@ -29,7 +29,7 @@ class PolynomialTest : public ::testing::Test
 {
 public:
   static inline const int MAX_NTT_LOG_SIZE = 24;
-  static inline const bool MEASURE = true;
+  static inline const bool MEASURE = false;
 
   // SetUpTestSuite/TearDownTestSuite are called once for the entire test suite
   static void SetUpTestSuite()
@@ -54,15 +54,17 @@ public:
     // code that executes after each test
   }
 
-  static Polynomial_t randomize_polynomial(uint32_t size, bool random = true)
+  static Polynomial_t randomize_polynomial(uint32_t size, bool random = true, bool from_evals = false)
   {
-    auto coeff = std::make_unique<scalar_t[]>(size);
+    auto elements = std::make_unique<scalar_t[]>(size);
     if (random) {
-      random_samples(coeff.get(), size);
+      random_samples(elements.get(), size);
     } else {
-      incremental_values(coeff.get(), size);
+      incremental_values(elements.get(), size);
     }
-    return Polynomial_t::from_coefficients(coeff.get(), size);
+
+    return from_evals ? Polynomial_t::from_rou_evaluations(elements.get(), size)
+                      : Polynomial_t::from_coefficients(elements.get(), size);
   }
 
   static void random_samples(scalar_t* res, uint32_t count)
@@ -163,6 +165,56 @@ TEST_F(PolynomialTest, evaluationOnDomain)
   ASSERT_EQ((f - g).degree(), -1);
 }
 
+TEST_F(PolynomialTest, evaluateOnRouDomain)
+{
+  const int logsize = 8;
+  const int size = 1 << logsize;
+  auto f = randomize_polynomial(size);
+  auto g = randomize_polynomial(size, true, true /*from_evals*/);
+
+  // build domain
+  auto test = [&](auto& p, int domain_logsize) {
+    const int domain_size = 1 << domain_logsize;
+    device_context::DeviceContext ctx = device_context::get_default_device_context();
+    scalar_t w = ntt::get_root_of_unity_from_domain<scalar_t>(domain_logsize, ctx);
+    auto domain = std::make_unique<scalar_t[]>(domain_size);
+    domain[0] = scalar_t::one();
+    for (int i = 1; i < domain_size; ++i) {
+      domain[i] = domain[i - 1] * w;
+    }
+
+    // evaluation on domain
+    auto evals_naive = std::make_unique<scalar_t[]>(domain_size);
+    START_TIMER(naive_evals);
+    p.evaluate_on_domain(domain.get(), domain_size, evals_naive.get());
+    END_TIMER(naive_evals, "naive evals took", MEASURE);
+
+    // evaluate on rou domain
+    auto evals_rou_domain = std::make_unique<scalar_t[]>(domain_size);
+    START_TIMER(rou_domain_evals);
+    p.evaluate_on_rou_domain(domain_logsize, evals_rou_domain.get());
+    END_TIMER(rou_domain_evals, "evals on rou domain took", MEASURE);
+
+    ASSERT_EQ(0, memcmp(evals_naive.get(), evals_rou_domain.get(), domain_size * sizeof(scalar_t)));
+  };
+
+  // test f (in coeffs state)
+  test(f, logsize + 2); // evaluate on larger domain
+  test(f, logsize - 3); // evaluate on smaller domain
+  test(f, logsize);     // evaluate on domain with size like poly
+  // test g (in evals state)
+  test(g, logsize + 2); // evaluate on larger domain
+  test(g, logsize - 3); // evaluate on smaller domain
+  test(g, logsize);     // evaluate on domain with size like poly
+
+  // test f*f (in reversed evals state)
+  auto f_squared = f * f;
+  auto new_logsize = logsize + 1;   // f_squared is twice the degree and size of f
+  test(f_squared, new_logsize + 2); // evaluate on larger domain
+  test(f_squared, new_logsize - 3); // evaluate on smaller domain
+  test(f_squared, new_logsize);     // evaluate on domain with size like poly
+}
+
 TEST_F(PolynomialTest, fromEvaluations)
 {
   const int size = 100;
@@ -419,40 +471,16 @@ TEST_F(PolynomialTest, View)
   const int size = 1 << 6;
 
   auto f = randomize_polynomial(size);
-  {
-    auto [d_coeff, N, device_id] = f.get_coefficients_view();
+  auto [d_coeff, N, device_id] = f.get_coefficients_view();
 
-    EXPECT_EQ(d_coeff.isValid(), true);
-    auto g = f + f;
-    // expecting the view to remain valid in that case
-    EXPECT_EQ(d_coeff.isValid(), true);
+  EXPECT_EQ(d_coeff.isValid(), true);
+  auto g = f + f;
+  // expecting the view to remain valid in that case
+  EXPECT_EQ(d_coeff.isValid(), true);
 
-    f += f;
-    // expecting view to be invalidated since f is modified
-    EXPECT_EQ(d_coeff.isValid(), false);
-  }
-
-  auto [d_evals, N, device_id] = f.get_rou_evaluations_view();
-  auto g = Polynomial_t::from_rou_evaluations(d_evals.get(), N);
-  assert_equal(f, g);
-}
-
-TEST_F(PolynomialTest, interpolation)
-{
-  const int size = 1 << 4;
-  const int interpolation_size = 1 << 6;
-
-  const auto x = scalar_t::rand_host();
-
-  auto f = randomize_polynomial(size);
-  auto [evals, N, device_id] = f.get_rou_evaluations_view(interpolation_size); // interpolate from 16 to 64 evaluations
-
-  auto g = Polynomial_t::from_rou_evaluations(evals.get(), N); // note the evals is a view to f
-  const auto fx = f(x);
-  ASSERT_EQ(evals.isValid(), false); // invaidated since f(x) transforms f to coefficients
-
-  const auto gx = g(x); // evaluating g which was constructed from interpolation of f
-  ASSERT_EQ(fx, gx);
+  f += f;
+  // expecting view to be invalidated since f is modified
+  EXPECT_EQ(d_coeff.isValid(), false);
 }
 
 TEST_F(PolynomialTest, slicing)
diff --git a/wrappers/rust/icicle-core/src/polynomials/mod.rs b/wrappers/rust/icicle-core/src/polynomials/mod.rs
index 8cad36de..ec8a4433 100644
--- a/wrappers/rust/icicle-core/src/polynomials/mod.rs
+++ b/wrappers/rust/icicle-core/src/polynomials/mod.rs
@@ -27,6 +27,7 @@ where
         domain: &D,
         evals: &mut E,
     );
+    fn eval_on_rou_domain<E: HostOrDeviceSlice<Self::Field> + ?Sized>(&self, domain_log_size: u64, evals: &mut E);
     fn get_nof_coeffs(&self) -> u64;
     fn get_coeff(&self, idx: u64) -> Self::Field;
     fn copy_coeffs<S: HostOrDeviceSlice<Self::Field> + ?Sized>(&self, start_idx: u64, coeffs: &mut S);
@@ -115,6 +116,9 @@ macro_rules! impl_univariate_polynomial_api {
             #[link_name = concat!($field_prefix, "_polynomial_evaluate_on_domain")]
             fn eval_on_domain(a: PolynomialHandle, domain: *const $field, domain_size: u64, evals: *mut $field);
 
+            #[link_name = concat!($field_prefix, "_polynomial_evaluate_on_rou_domain")]
+            fn eval_on_rou_domain(a: PolynomialHandle, domain_log_size: u64, evals: *mut $field);
+
             #[link_name = concat!($field_prefix, "_polynomial_degree")]
             fn degree(a: PolynomialHandle) -> i64;
 
@@ -255,6 +259,20 @@ macro_rules! impl_univariate_polynomial_api {
                 }
             }
 
+            fn eval_on_rou_domain<E: HostOrDeviceSlice<Self::Field> + ?Sized>(
+                &self,
+                domain_log_size: u64,
+                evals: &mut E,
+            ) {
+                assert!(
+                    evals.len() >= 1 << domain_log_size,
+                    "eval_on_rou_domain(): eval size must not be smaller than domain"
+                );
+                unsafe {
+                    eval_on_rou_domain(self.handle, domain_log_size, evals.as_mut_ptr());
+                }
+            }
+
             fn get_nof_coeffs(&self) -> u64 {
                 unsafe {
                     // returns total #coeffs. Not copying when null
@@ -732,6 +750,25 @@ macro_rules! impl_polynomial_tests {
             assert_eq!(f.eval(&host_evals_from_device[2]), host_evals[2]);
         }
 
+        #[test]
+        #[ignore]
+        fn test_eval_on_rou_domain() {
+            setup();
+
+            let poly_log_size = 10;
+            let domain_log_size = poly_log_size + 2; // interpolate 4 times
+            let f = randomize_poly(1 << poly_log_size);
+
+            // evaluate f on rou domain of size 4n
+            let mut device_evals = DeviceVec::<ScalarField>::cuda_malloc(1 << domain_log_size).unwrap();
+            f.eval_on_rou_domain(domain_log_size, &mut device_evals[..]);
+
+            // construct g from f's evals and assert they are equal
+            let g = Poly::from_rou_evals(&device_evals[..], 1 << domain_log_size);
+            let diff = &f - &g;
+            assert_eq!(diff.degree(), -1); // diff is the zero poly
+        }
+
         #[test]
         #[ignore]
         fn test_odd_even_slicing() {