#include <limits>
#include <chrono>
#include <thread>
#include <functional>

#ifndef _WIN32
#include <dlfcn.h>
#endif

#include "vkrender.h"
#include "glfw.h"
#include "shaderResources.h"
#include "picture.h"
#include "drawimage.h"
#include "EXRFiles.h"
#include "fpu.h"

#include "rendererloader.h"  // for headlessRenderer

#include "vkutils.h"
#include "ThreadSafeQueue.h"

// For debugging:
#if defined(ENABLE_VK_VALIDATION)
#define VALIDATION
#endif

#define SHADER_DIRECTORY "shaders/"
#define VALIDATION_LAYER "VK_LAYER_KHRONOS_validation"

#define VARIABLE_NAME(var) (#var)

#if defined(_WIN32)
#include <Windows.h>
#include <vulkan/vulkan_win32.h>
#define GLFW_EXPOSE_NATIVE_WIN32
#include <GLFW/glfw3native.h>
#elif defined(__APPLE__)
#include <mach-o/dyld.h>
#include <sys/stat.h>
#endif

using settings::getSetting;
using settings::Setting;

void exitHandler(int);

#ifdef HAVE_VULKAN
// Vulkan API version requested by this renderer. It is passed to
// vk::ApplicationInfo in createInstance() and to VmaAllocatorCreateInfo in
// createAllocator().
uint32_t apiVersion=VK_API_VERSION_1_4;

// Instance extensions this file adds on top of the ones reported by GLFW
// (see getRequiredInstanceExtensions()). The debug extensions are listed only
// when VALIDATION or DEBUG is defined.
std::vector<const char*> instanceExtensions
{
  VK_KHR_GET_PHYSICAL_DEVICE_PROPERTIES_2_EXTENSION_NAME,
  // VK_KHR_PORTABILITY_ENUMERATION_EXTENSION_NAME,
#if defined(VALIDATION) || defined(DEBUG)
  VK_EXT_DEBUG_UTILS_EXTENSION_NAME,
  VK_EXT_DEBUG_REPORT_EXTENSION_NAME
#endif

};

using namespace glm;

namespace camp
{

// Vertex input description helpers (extracted from render.h vertex structs
// to keep render.h free of Vulkan API dependencies)
// Binding description for MaterialVertex data: binding slot 0, a stride of
// sizeof(MaterialVertex), stepped once per vertex.
inline vk::VertexInputBindingDescription materialVertexBinding()
{
  constexpr uint32_t bindingSlot = 0;
  return vk::VertexInputBindingDescription(
    bindingSlot,
    sizeof(MaterialVertex),
    vk::VertexInputRate::eVertex);
}

inline std::vector<vk::VertexInputAttributeDescription> materialVertexAttributes(bool count = false)
{
  std::vector<vk::VertexInputAttributeDescription> attributeDescriptions;
  attributeDescriptions.push_back(
      vk::VertexInputAttributeDescription(POSITION_LOCATION, 0, vk::Format::eR32G32B32Sfloat, offsetof(MaterialVertex, position)));

  if (!count) {
    attributeDescriptions.push_back(
        vk::VertexInputAttributeDescription(NORMAL_LOCATION, 0, vk::Format::eR32G32B32Sfloat, offsetof(MaterialVertex, normal)));
    attributeDescriptions.push_back(
        vk::VertexInputAttributeDescription(MATERIAL_LOCATION, 0, vk::Format::eR32Sint, offsetof(MaterialVertex, material)));
  }
  return attributeDescriptions;
}

// Binding description for ColorVertex data: binding slot 0, a stride of
// sizeof(ColorVertex), stepped once per vertex.
inline vk::VertexInputBindingDescription colorVertexBinding()
{
  constexpr uint32_t bindingSlot = 0;
  return vk::VertexInputBindingDescription(
    bindingSlot,
    sizeof(ColorVertex),
    vk::VertexInputRate::eVertex);
}

inline std::vector<vk::VertexInputAttributeDescription> colorVertexAttributes(bool count = false)
{
  std::vector<vk::VertexInputAttributeDescription> attributeDescriptions;
  attributeDescriptions.push_back(
      vk::VertexInputAttributeDescription(POSITION_LOCATION, 0, vk::Format::eR32G32B32Sfloat, offsetof(ColorVertex, position)));

  if (!count) {
    attributeDescriptions.push_back(
        vk::VertexInputAttributeDescription(NORMAL_LOCATION, 0, vk::Format::eR32G32B32Sfloat, offsetof(ColorVertex, normal)));
    attributeDescriptions.push_back(
        vk::VertexInputAttributeDescription(MATERIAL_LOCATION, 0, vk::Format::eR32Sint, offsetof(ColorVertex, material)));
    attributeDescriptions.push_back(
        vk::VertexInputAttributeDescription(COLOR_LOCATION, 0, vk::Format::eR32G32B32A32Sfloat, offsetof(ColorVertex, color)));
  }
  return attributeDescriptions;
}

// Binding description for PointVertex data: binding slot 0, a stride of
// sizeof(PointVertex), stepped once per vertex.
inline vk::VertexInputBindingDescription pointVertexBinding()
{
  constexpr uint32_t bindingSlot = 0;
  return vk::VertexInputBindingDescription(
    bindingSlot,
    sizeof(PointVertex),
    vk::VertexInputRate::eVertex);
}

inline std::vector<vk::VertexInputAttributeDescription> pointVertexAttributes(bool count = false)
{
  std::vector<vk::VertexInputAttributeDescription> attributeDescriptions;
  attributeDescriptions.push_back(
      vk::VertexInputAttributeDescription(POSITION_LOCATION, 0, vk::Format::eR32G32B32Sfloat, offsetof(PointVertex, position)));

  // Always include width for points
  attributeDescriptions.push_back(
      vk::VertexInputAttributeDescription(WIDTH_LOCATION, 0, vk::Format::eR32Sfloat, offsetof(PointVertex, width)));

  if (!count) {
    attributeDescriptions.push_back(
        vk::VertexInputAttributeDescription(MATERIAL_LOCATION, 0, vk::Format::eR32Sint, offsetof(PointVertex, material)));
  }
  return attributeDescriptions;
}

// Vertex input trait specializations: map a vertex struct type to its
// Vulkan binding/attribute description free functions.
// Primary template is intentionally left undefined: only the vertex types
// specialized below can be used.
template<typename V> struct VertexInputTraits;

// MaterialVertex -> materialVertexBinding / materialVertexAttributes
template<> struct VertexInputTraits<MaterialVertex> {
  static vk::VertexInputBindingDescription binding()
  {
    return materialVertexBinding();
  }

  static std::vector<vk::VertexInputAttributeDescription> attributes(bool count)
  {
    return materialVertexAttributes(count);
  }
};

// ColorVertex -> colorVertexBinding / colorVertexAttributes
template<> struct VertexInputTraits<ColorVertex> {
  static vk::VertexInputBindingDescription binding()
  {
    return colorVertexBinding();
  }

  static std::vector<vk::VertexInputAttributeDescription> attributes(bool count)
  {
    return colorVertexAttributes(count);
  }
};

// PointVertex -> pointVertexBinding / pointVertexAttributes
template<> struct VertexInputTraits<PointVertex> {
  static vk::VertexInputBindingDescription binding()
  {
    return pointVertexBinding();
  }

  static std::vector<vk::VertexInputAttributeDescription> attributes(bool count)
  {
    return pointVertexAttributes(count);
  }
};

// Reads the whole of `filename` in binary mode and returns its bytes.
//
// Errors are reported through runtimeError(): when the file cannot be opened,
// when its size cannot be determined, or when fewer bytes than expected could
// be read. An empty file yields an empty vector.
std::vector<char> readFile(const std::string& filename)
{
  // Open positioned at the end so that tellg() gives the file size.
  std::ifstream file(filename, std::ios::ate | std::ios::binary);
  if (!file.is_open())
    runtimeError("failed to open file " + filename);

  std::vector<char> buffer;

  // tellg() yields -1 on failure; casting that to size_t would request an
  // enormous allocation, so check it before sizing the buffer.
  std::streamoff const endPos = file.tellg();
  if (endPos < 0)
    runtimeError("failed to determine size of file " + filename);

  if (endPos > 0) {
    buffer.resize(static_cast<size_t>(endPos));

    file.seekg(0);
    file.read(buffer.data(), static_cast<std::streamsize>(buffer.size()));
    if (!file)
      runtimeError("failed to read file " + filename);
  }

  return buffer;
}

// Captures what `gpu` reports for `surface`: the surface capabilities, the
// supported surface formats and the supported present modes.
SwapChainDetails::SwapChainDetails(
  vk::PhysicalDevice gpu,
  vk::SurfaceKHR surface)
  : capabilities(gpu.getSurfaceCapabilitiesKHR(surface)),
    formats(gpu.getSurfaceFormatsKHR(surface)),
    presentModes(gpu.getSurfacePresentModesKHR(surface))
{
}

// True when at least one surface format and at least one present mode were
// reported.
SwapChainDetails::operator bool() const
{
  bool const missingSomething = formats.empty() || presentModes.empty();
  return !missingSomething;
}

// Returns the first reported format that is B8G8R8A8 UNORM with the
// sRGB-nonlinear colour space; if there is none, returns the first reported
// format.
vk::SurfaceFormatKHR
SwapChainDetails::chooseSurfaceFormat() const
{
  auto const preferred = std::find_if(
    formats.begin(),
    formats.end(),
    [](auto const& candidate) {
      return candidate.format == vk::Format::eB8G8R8A8Unorm &&
             candidate.colorSpace == vk::ColorSpaceKHR::eSrgbNonlinear;
    });

  if (preferred != formats.end())
    return *preferred;

  return formats.front();
}

// Picks the present mode matching the "vsync" setting: FIFO when vsync is on,
// IMMEDIATE when it is off. If the wanted mode is not reported, the first
// reported mode is returned.
vk::PresentModeKHR
SwapChainDetails::choosePresentMode() const
{
  auto const wanted = settings::getSetting<bool>("vsync")
                        ? vk::PresentModeKHR::eFifo
                        : vk::PresentModeKHR::eImmediate;

  for (const auto& candidate : presentModes) {
    if (candidate == wanted)
      return candidate;
  }

  return presentModes.front();
}

// Chooses the swap chain extent.
// If the surface reports a concrete current extent (width differs from the
// uint32_t maximum) that extent is returned unchanged; otherwise the requested
// width/height are clamped to the surface's min/max image extent.
vk::Extent2D
SwapChainDetails::chooseExtent(size_t width, size_t height) const
{
  auto const& current = capabilities.currentExtent;
  if (current.width != std::numeric_limits<uint32_t>::max())
    return current;

  auto const& lowest = capabilities.minImageExtent;
  auto const& highest = capabilities.maxImageExtent;

  auto const requestedWidth = static_cast<uint32_t>(width);
  auto const requestedHeight = static_cast<uint32_t>(height);

  vk::Extent2D chosen;
  chosen.width = clamp(requestedWidth, lowest.width, highest.width);
  chosen.height = clamp(requestedHeight, lowest.height, highest.height);
  return chosen;
}

// Requests one image more than the surface minimum, capped at maxImageCount
// when that limit is non-zero (a maxImageCount of 0 is treated as "no cap").
std::uint32_t
SwapChainDetails::chooseImageCount() const
{
  std::uint32_t const desired = capabilities.minImageCount + 1;
  std::uint32_t const limit = capabilities.maxImageCount;

  bool const overLimit = limit > 0 && desired > limit;
  return overLimit ? limit : desired;
}

// Delegates to AsyRender::setProjection() and then sets newUniformBuffer.
void AsyVkRender::setProjection()
{
  AsyRender::setProjection();

  // Flag the change for the Vulkan side.
  newUniformBuffer = true;
}

// Delegates to AsyRender::updateModelViewData() and then sets
// newUniformBuffer.
void AsyVkRender::updateModelViewData()
{
  AsyRender::updateModelViewData();
  newUniformBuffer = true;
}

// update() now implemented in base class AsyRender::update()

// Creates the GLFW render window (Width x Height, titled `title`) with this
// renderer passed as the last argument, and stores it in glfwWindow.
void AsyVkRender::initWindow()
{
  glfwWindow = glfwCreateRenderWindow(
    Width,
    Height,
    title,
    this);
}

// RenderCallbacks interface implementation
// Mouse button callback: forwarded unchanged to the base class.
void AsyVkRender::onMouseButton(int button, int action, int mods)
{
  AsyRender::onMouseButton(button, action, mods);
}

// Framebuffer resize callback: forwarded unchanged to the base class.
void AsyVkRender::onFramebufferResize(int width, int height)
{
  AsyRender::onFramebufferResize(width, height);
}

// Scroll callback: forwarded unchanged to the base class.
void AsyVkRender::onScroll(double xoffset, double yoffset)
{
  AsyRender::onScroll(xoffset, yoffset);
}

// Cursor position callback: forwarded unchanged to the base class.
void AsyVkRender::onCursorPos(double xpos, double ypos)
{
  AsyRender::onCursorPos(xpos, ypos);
}

// Keyboard callback: forwarded unchanged to the base class.
void AsyVkRender::onKey(int key, int scancode, int action, int mods)
{
  AsyRender::onKey(key, scancode, action, mods);
}

// Window focus callback. Losing focus is ignored; gaining focus sets
// recreatePipeline, since the swapchain might need to be recreated.
void AsyVkRender::onWindowFocus(int focused)
{
  if (!focused)
    return;

  recreatePipeline = true;
}

// Window close callback: hands over to exitHandler with argument 0.
void AsyVkRender::onClose()
{
  exitHandler(0);
}

// Runs the common AsyRender::updateHandler() logic, then waits for the
// logical device (if one exists) to go idle and sets framebufferResized.
// The int argument is unused.
void AsyVkRender::updateHandler(int)
{
  AsyRender::updateHandler();

  // Vulkan-specific part.
  if(device) {
    device->waitIdle();
  }

  framebufferResized=true;
}

// Destroys the GLFW window when View is set, then finalizes glslang.
AsyVkRender::~AsyVkRender()
{
  if (View)
  {
    auto* const window = getGLFWWindow();
    ::glfwDestroyWindow(window);
    glfwWindow = nullptr;
  }

  glslang::FinalizeProcess();
}

// Top-level render entry point.
//
// Copies the render arguments, (re)initialises the display when needed, reads
// the GPU-related settings, creates the window and the Vulkan state on first
// use, and finally enters mainLoop(). With HAVE_PTHREAD, once the view has
// been initialised and `threads` is set, the call only queues an update (or an
// export) and returns without reaching mainLoop().
void AsyVkRender::render(RenderFunctionArgs const& args)
{
#if !defined(_WIN32)
  // Overwrite XMODIFIERS with an empty value for this process.
  // NOTE(review): the reason is not stated here — presumably to keep X input
  // methods out of the way; confirm.
  setenv("XMODIFIERS","",true);
#endif

  copyRenderArgs(args);

#ifdef HAVE_PTHREAD
  // Function-local static: becomes true after the View branch below has run
  // once, and selects the hand-off path on later calls.
  static bool initializedView=false;
  if(vkinitialize)
    Fitscreen=1;
#endif

  // Skipped only when already initialized AND running interactively.
  if(!(initialized && interact::interactive)) {
    antialias=settings::getSetting<Int>("antialias") > 1;

    // NOTE(review): if args.width and args.height are integral this is an
    // integer division — confirm their types.
    Aspect = args.width/args.height;

    // On macOS with llvmpipe (no Metal), don't create a GLFW window -
    // there's no display backend to present to. Use offscreen rendering.
    if (headlessRenderer) {
        View = false;
    }

    initDisplay(args.width, args.height);
  }

  maxFragments = 0;
  havewindow = View && threads;

  clearMaterials();
  shouldUpdateBuffers = true;
  initialized=true;

#ifdef HAVE_PTHREAD
  // Threaded path after the first view setup: do not render here.
  if(threads && initializedView) {
    if(View) {
      // Called from asymain thread, main thread handles rendering
      hideWindow=false;
      threadMgr.messageQueue.enqueue(RendererMessage::updateRenderer);
    } else readyAfterExport=queueExport=true;
    return;
  }
#endif

  GPUcompress=settings::getSetting<bool>("GPUcompress");

  // Both sizes are validated with checkpow2(); groupSize is their product.
  localSize=settings::getSetting<Int>("GPUlocalSize");
  checkpow2(localSize,"GPUlocalSize");
  blockSize=settings::getSetting<Int>("GPUblockSize");
  checkpow2(blockSize,"GPUblockSize");
  groupSize=localSize*blockSize;

  // These settings are read only while Vulkan has not been initialised yet.
  if(vkinitialize) {
    interlock=settings::getSetting<bool>("GPUinterlock");
    fxaa=settings::getSetting<bool>("fxaa");
    srgb=settings::getSetting<bool>("srgb");

    ibl=settings::getSetting<bool>("ibl");
  }

  if(View) {
    // Create the window lazily, on first use.
    if(!glfwWindow)
      initWindow();
    if(!getSetting<bool>("fitscreen"))
      Fitscreen=0;
    fitscreen();
#ifdef HAVE_PTHREAD
    initializedView=true;
#endif
  }

  // One-time Vulkan initialisation; the flag is cleared before the call.
  if(vkinitialize) {
    vkinitialize=false;
    initVulkan();
  }

  readyForUpdate=true;
  mainLoop();
}

// One-time creation of all Vulkan state, in dependency order: instance,
// debug messenger, surface (View only), physical/logical device, allocator,
// command pools/buffers, swap chain or offscreen buffers, descriptors, render
// passes, pipelines, attachments, framebuffers and export resources.
// Called from render() while vkinitialize is set.
void AsyVkRender::initVulkan()
{
#ifdef __APPLE__
  // Point the Vulkan loader to the bundled MoltenVK ICD if available
  {
    char exePath[PATH_MAX];
    uint32_t size = sizeof(exePath);
    if (_NSGetExecutablePath(exePath, &size) == 0) {
      char realPath[PATH_MAX];
      if (realpath(exePath, realPath)) {
        // Strip the executable name to get its directory.
        std::string exeDir(realPath);
        size_t lastSlash = exeDir.rfind('/');
        if (lastSlash != std::string::npos)
          exeDir = exeDir.substr(0, lastSlash);
        std::string icdPath = exeDir + "/lib/MoltenVK_icd.json";
        struct stat st;
        // Only set the variables when the ICD manifest is a regular file;
        // overwrite=0 leaves values already present in the environment alone.
        if (stat(icdPath.c_str(), &st) == 0 && S_ISREG(st.st_mode)) {
          setenv("VK_ICD_FILENAMES", icdPath.c_str(), 0);
          setenv("VK_DRIVER_FILES", icdPath.c_str(), 0);
        }
      }
    }
  }

  // Not overwritten if the user already set it.
  setenv("MVK_CONFIG_LOG_LEVEL","1",false);

  // Use smallest memory footprint during command buffer encoding
  setenv("MVK_CONFIG_PREFILL_METAL_COMMAND_BUFFERS", "2", true);

  // Disable Metal argument buffers (0) to avoid SSBO descriptor indexing
  // issues with transparent rendering.
  setenv("MVK_CONFIG_USE_METAL_ARGUMENT_BUFFERS", "0", true);

  setenv("MVK_CONFIG_PERFORMANCE_TRACKING", "0", true);
#endif

  // Vulkan dispatch loader is already initialized in rendererloader.cc
  // via dlsym(RTLD_DEFAULT, "vkGetInstanceProcAddr"), so no need to
  // reference the raw vkGetInstanceProcAddr symbol here (avoids -lvulkan).

  if (!glslang::InitializeProcess())
    runtimeError("failed to initialize glslang");

  // Offscreen rendering (no View) always uses a single frame in flight.
  maxFramesInFlight=View ? settings::getSetting<Int>("maxFramesInFlight") : 1;
  frameObjects.resize(maxFramesInFlight);

  if (settings::verbose > 1) {
    std::cout << "Using " << maxFramesInFlight
              << " maximum frame(s) in flight" << std::endl;
  }
  createInstance();
  createDebugMessenger();
  // The surface must exist before pickPhysicalDevice(), which checks
  // presentation support against it when View is set.
  if (View) createSurface();
  pickPhysicalDevice();

  // Fragment shader interlock is switched off for devices that
  // isNVIDIA30xx() matches.
  if(isNVIDIA30xx(physicalDevice.getProperties().deviceName))
    interlock = false;

  fpu_trap(false); // Work around FE_INVALID.

  createLogicalDevice();
  createAllocator();
  createCommandPools();
  createCommandBuffers();
  if (View) createSwapChain();
  else createOffscreenBuffers();

  if (fxaa)
  {
    setupPostProcessingComputeParameters();
  }
  createImageViews();
  createSyncObjects();

  createDescriptorSetLayout();
  createComputeDescriptorSetLayout();

  createBuffers();

  if (ibl) {
    initIBL();
  }

  createDescriptorPool();
  createComputeDescriptorPool();
  createDescriptorSets();

  createImmediateRenderTargets();

  if (fxaa) {
    transitionFXAAImages();
  }
  writeDescriptorSets();
  writeMaterialAndLightDescriptors();

  createCountRenderPass();
  createGraphicsRenderPass();
  createGraphicsPipelineLayout();
  createGraphicsPipelines();

  createComputePipelines(); // gpu indexing + post processing

  // Restore the configured floating-point trap state disabled above.
  fpu_trap(settings::trap());

  createAttachments();
  createFramebuffers();
  createExportResources();
}

// Rebuilds the swap chain and everything that depends on it (image views,
// sync objects, descriptors, render passes, pipelines, framebuffers, export
// resources) after waiting for the device to go idle.
// A vk::OutOfDeviceMemoryError raised during the rebuild is routed to
// outOfMemory(). On exit, redisplay is set and waitEvent is cleared.
void AsyVkRender::recreateSwapChain()
{
  device->waitIdle();

  fpu_trap(false); // Work around FE_INVALID
  try {
    // Reset timeline semaphore values to avoid timeout issues
    currentTimelineValue = 0;
    for (auto& frameObj : frameObjects) {
      frameObj.timelineValue = 0;
      frameObj.computeTimelineValue = 0;  // Also reset compute timeline value
    }

    // Reset the timeline semaphore and recreate it
    renderTimelineSemaphore.reset();
    renderTimelineSemaphore = createTimelineSemaphore(0);

    resetDepth=true;
    createSwapChain();

    if (fxaa)
      setupPostProcessingComputeParameters();

    createDependentBuffers();
    createImmediateRenderTargets();

    if (fxaa) {
      transitionFXAAImages();

      // Recreate the post-process descriptor sets from scratch
      postProcessDescSet.clear();

      // Reallocate descriptor sets with the new layout
      // (one descriptor set per backbuffer image).
      std::vector<vk::DescriptorSetLayout> postProcessDescLayouts(backbufferImages.size(), *postProcessDescSetLayout);
      try {
        postProcessDescSet = device->allocateDescriptorSetsUnique({*postProcessDescPool, VEC_VIEW(postProcessDescLayouts)});
      } catch (const std::exception& e) {
        runtimeError("Failed to allocate post-process descriptor sets: " +
                     std::string(e.what()));
      }
    }

    createImageViews();
    createSyncObjects();

    writeDescriptorSets();
    writeMaterialAndLightDescriptors();

    createAttachments();
    createCountRenderPass();
    createGraphicsRenderPass();
    createGraphicsPipelines();
    createComputePipelines();
    createFramebuffers();
    createExportResources();
  } catch (const vk::OutOfDeviceMemoryError& e) {
    outOfMemory();
  }

  // Restore the configured floating-point trap state disabled above.
  fpu_trap(settings::trap());
  redisplay=true;
  waitEvent=false;
}

// Creates the surface if it does not exist yet, (re)creates the swap chain
// with its image views, framebuffers and immediate render targets, refreshes
// the present queue, resets the timeline semaphore state and requests a
// pipeline recreation via recreatePipeline.
void AsyVkRender::initializeSwapChainIfNeeded()
{
  device->waitIdle();

  if (!surface) {
    createSurface();
  }

  // Queried before createSwapChain(); only the present family is used below.
  auto presentFamilyIndices = findQueueFamilies(physicalDevice, &*surface);

  createSwapChain();

  if (presentFamilyIndices.presentQueueFamilyFound) {
    presentQueue = device->getQueue(presentFamilyIndices.presentQueueFamily, 0);
  }

  createImageViews();
  createFramebuffers();
  createImmediateRenderTargets();
  // NOTE(review): initVulkan() and recreateSwapChain() call this only when
  // fxaa is set; here it is unconditional — confirm that is intended.
  transitionFXAAImages();

  // Start all timeline counters from zero, matching the fresh semaphore
  // created below.
  currentTimelineValue = 0;
  for (auto& frameObj : frameObjects) {
    frameObj.timelineValue = 0;
    frameObj.computeTimelineValue = 0;
  }

  renderTimelineSemaphore.reset();
  renderTimelineSemaphore = createTimelineSemaphore(0);

  recreatePipeline = true;
}

// Records and submits a one-shot command buffer that moves every immediate
// render target image and every pre-presentation image from the undefined
// layout to the general layout, targeting the compute shader stage.
// Immediate render targets get shader-read access, pre-presentation images
// get shader-write access.
void AsyVkRender::transitionFXAAImages()
{
  auto cmdBuffer = beginSingleCommands();

  // Shared transition; only the destination access mask differs per group.
  auto const toGeneralLayout = [&](auto& target, auto dstAccess) {
    transitionImageLayout(
      cmdBuffer,
      target.getImage(),
      vk::AccessFlagBits::eNone,
      dstAccess,
      vk::ImageLayout::eUndefined,
      vk::ImageLayout::eGeneral,
      vk::PipelineStageFlagBits::eTopOfPipe,
      vk::PipelineStageFlagBits::eComputeShader,
      vk::ImageSubresourceRange(vk::ImageAspectFlagBits::eColor, 0, 1, 0, 1)
      );
  };

  for (auto& renderTarget : immediateRenderTargetImgs)
    toGeneralLayout(renderTarget, vk::AccessFlagBits::eShaderRead);

  for (auto& prePresent : prePresentationImages)
    toGeneralLayout(prePresent, vk::AccessFlagBits::eShaderWrite);

  endSingleCommands(cmdBuffer);
}

// Zeroes the global-sum, opaque-depth and count buffers (plus the index
// buffer when GPUcompress is set) using a single one-shot command buffer.
void AsyVkRender::zeroTransparencyBuffers()
{
  auto const cmd=beginSingleCommands();
  auto const clear=[&](auto& bf) { zeroBuffer(cmd,bf.getBuffer()); };

  clear(globalSumBf);
  clear(opaqueDepthBf);
  if(GPUcompress) {
    clear(indexBf);
  }
  clear(countBf);

  endSingleCommands(cmd);
}

// Returns the names of all instance extensions reported by
// vk::enumerateInstanceExtensionProperties().
std::set<std::string> AsyVkRender::getInstanceExtensions()
{
  auto reported = vk::enumerateInstanceExtensionProperties();

  std::set<std::string> names;
  for (auto& properties : reported)
    names.insert(properties.extensionName);

  return names;
}

// Returns the names of all device extensions reported by `device`.
std::set<std::string> AsyVkRender::getDeviceExtensions(vk::PhysicalDevice& device)
{
  auto reported = device.enumerateDeviceExtensionProperties();

  std::set<std::string> names;
  for (auto& properties : reported)
    names.insert(properties.extensionName);

  return names;
}

// Returns the instance extensions GLFW asks for, followed by the entries of
// the file-level instanceExtensions list.
std::vector<const char*> AsyVkRender::getRequiredInstanceExtensions()
{
  uint32_t glfwCount;
  auto const glfwNames = glfwGetRequiredInstanceExtensions(&glfwCount);

  std::vector<const char*> required(glfwNames, glfwNames + glfwCount);
  required.insert(required.end(),
                  instanceExtensions.begin(),
                  instanceExtensions.end());
  return required;
}

// Creates the Vulkan instance and initialises the default dispatcher with it.
//
// The instance is created with every extension that
// getInstanceExtensions() reports, and with the Khronos validation layer when
// VALIDATION is defined and the layer is available.
void AsyVkRender::createInstance()
{
  auto appInfo = vk::ApplicationInfo(
    PACKAGE_STRING,
    VK_MAKE_VERSION(1, 0, 0),
    "No Engine",
    VK_MAKE_VERSION(1, 0, 0),
    apiVersion
  );
  auto supportedExtensions = getInstanceExtensions();
  auto supportedLayers = vk::enumerateInstanceLayerProperties();
  // NOTE(review): `extensions` is not referenced again in this function; the
  // instance is created from all_extensions below instead.
  auto extensions = getRequiredInstanceExtensions();

  // True when `layerName` is among the layers reported by the loader.
  // Only called when VALIDATION is defined.
  auto isLayerSupported = [supportedLayers](std::string layerName) {
    return std::find_if(
      supportedLayers.begin(),
      supportedLayers.end(),
      [layerName](vk::LayerProperties const& layer) {
        return layer.layerName.data() == layerName;
      }) != supportedLayers.end();
  };

  // NOTE(review): this lambda is never called in this function.
  auto isExtensionSupported = [supportedExtensions](std::string extension) {
    return std::find_if(
      supportedExtensions.begin(),
      supportedExtensions.end(),
      [extension](std::string const& supportedExt) {
        return supportedExt == extension;
      }) != supportedExtensions.end();
  };

#ifdef VALIDATION
  if (isLayerSupported(VALIDATION_LAYER)) {
    validationLayers.emplace_back(VALIDATION_LAYER);
  } else if (settings::verbose > 1) {
    std::cout << "Validation layers are not supported by the current Vulkan instance" << std::endl;
  }
#endif

  // Enable every supported instance extension. The pointers refer to the
  // strings owned by supportedExtensions, which outlives instance creation.
  std::vector<const char*> all_extensions;
  all_extensions.reserve(supportedExtensions.size());

  for (const auto& str : supportedExtensions) {
      all_extensions.push_back(str.c_str());
  }

  auto const instanceCI = vk::InstanceCreateInfo(
#if defined(__APPLE__)
    vk::InstanceCreateFlagBits::eEnumeratePortabilityKHR,
#else
    {},
#endif
    &appInfo,
    VEC_VIEW(validationLayers),
    VEC_VIEW(all_extensions)
  );
#ifdef VALIDATION
#ifndef _WIN32
  // Preload the validation layer into global scope so its function pointer
  // chain works correctly even though libasyvulkan.so was loaded with RTLD_LOCAL.
  // A failed preload is only reported as a warning; the handle is never closed.
  void *layerHandle = dlopen("libVkLayer_khronos_validation.so", RTLD_GLOBAL | RTLD_NOW);
  if (!layerHandle) {
    std::cerr << "Warning: failed to preload validation layer: "
              << dlerror() << std::endl;
  }
#endif
#endif

  instance = vk::createInstanceUnique(instanceCI);
  // Load instance-level function pointers into the default dispatcher.
  VULKAN_HPP_DEFAULT_DISPATCHER.init(*instance);
}

// Installs a debug-utils messenger on the instance when VALIDATION is
// defined; otherwise does nothing.
//
// Errors of the validation message type are always reported. When
// settings::verbose > 2, warning/info/verbose severities and the general and
// performance message types are enabled as well. Info and verbose messages
// are written to cerr; warnings and errors go through reportWarning(). The
// callback always returns vk::False.
void AsyVkRender::createDebugMessenger()
{
#if defined(VALIDATION)
  vk::DebugUtilsMessageSeverityFlagsEXT severityFlags(vk::DebugUtilsMessageSeverityFlagBitsEXT::eError);
  vk::DebugUtilsMessageTypeFlagsEXT typeFlags(vk::DebugUtilsMessageTypeFlagBitsEXT::eValidation);

  // NOTE(review): this used to be three separate blocks that all tested
  // `settings::verbose > 2`, one of which contained the doubled assignment
  // `typeFlags |= typeFlags |= ...`. They are merged here without changing
  // which flags get enabled; if graded verbosity thresholds were intended,
  // split this block again with distinct limits.
  if (settings::verbose > 2)
  {
    severityFlags |= vk::DebugUtilsMessageSeverityFlagBitsEXT::eWarning;
    severityFlags |= vk::DebugUtilsMessageSeverityFlagBitsEXT::eInfo;
    severityFlags |= vk::DebugUtilsMessageSeverityFlagBitsEXT::eVerbose;

    typeFlags |= vk::DebugUtilsMessageTypeFlagBitsEXT::eGeneral;
    typeFlags |= vk::DebugUtilsMessageTypeFlagBitsEXT::ePerformance;
  }

  auto const debugCreateInfo= vk::DebugUtilsMessengerCreateInfoEXT(
          {}, severityFlags, typeFlags,
          [](vk::DebugUtilsMessageSeverityFlagBitsEXT msgSeverity,
             vk::DebugUtilsMessageTypeFlagsEXT msgType,
             vk::DebugUtilsMessengerCallbackDataEXT const* pCallbackData,
             void* pUserData) -> vk::Bool32 {
            switch (msgSeverity) {
              case vk::DebugUtilsMessageSeverityFlagBitsEXT::eInfo:
                cerr << "validation layer: " << pCallbackData->pMessage
                     << std::endl;
                break;
              case vk::DebugUtilsMessageSeverityFlagBitsEXT::eVerbose:
                cerr << "[VERBOSE] validation layer: "
                     << pCallbackData->pMessage << std::endl;
                break;
              case vk::DebugUtilsMessageSeverityFlagBitsEXT::eWarning:
              case vk::DebugUtilsMessageSeverityFlagBitsEXT::eError:
                reportWarning(pCallbackData->pMessage);
                break;
            }

            return vk::False;
          },
          this
  );
  debugUtilsMsg=
          instance->createDebugUtilsMessengerEXTUnique(
          debugCreateInfo);
#endif
}

// Creates the presentation surface for the GLFW window and stores it in
// `surface` as a unique handle owned by `instance`.
//
// On Windows the surface is created directly from the window's HWND; on
// other platforms glfwCreateWindowSurface() is used and a failure is reported
// through runtimeError().
void AsyVkRender::createSurface()
{
#if defined(_WIN32)
  vk::Win32SurfaceCreateInfoKHR createInfo = {};
  createInfo.hwnd = glfwGetWin32Window(getGLFWWindow());
  createInfo.hinstance = GetModuleHandleA(nullptr);

  vk::SurfaceKHR tmpSurface;

  vkutils::checkVkResult(instance->createWin32SurfaceKHR(
    &createInfo,
    nullptr,
    &tmpSurface
  ));

  // Hand the owning instance to the unique handle, as the non-Windows branch
  // does. Without it the deleter is default-constructed (no owner, no
  // dispatcher) and cannot destroy the surface when the handle is released.
  surface=vk::UniqueSurfaceKHR(tmpSurface, *instance);
#else
  VkSurfaceKHR surfaceTmp;
  if (glfwCreateWindowSurface(*instance, getGLFWWindow(), nullptr, &surfaceTmp) != VK_SUCCESS)
    runtimeError("failed to create window surface");
  surface=vk::UniqueSurfaceKHR(surfaceTmp, *instance);
#endif
}

void AsyVkRender::createAllocator()
{
  VmaVulkanFunctions vkFuncs = {};
  vkFuncs.vkGetInstanceProcAddr = VULKAN_HPP_DEFAULT_DISPATCHER.vkGetInstanceProcAddr;
  vkFuncs.vkGetDeviceProcAddr = VULKAN_HPP_DEFAULT_DISPATCHER.vkGetDeviceProcAddr;
  vkFuncs.vkGetBufferMemoryRequirements2KHR = VULKAN_HPP_DEFAULT_DISPATCHER.vkGetBufferMemoryRequirements2 ? VULKAN_HPP_DEFAULT_DISPATCHER.vkGetBufferMemoryRequirements2 : VULKAN_HPP_DEFAULT_DISPATCHER.vkGetBufferMemoryRequirements2KHR;
  vkFuncs.vkGetImageMemoryRequirements2KHR = VULKAN_HPP_DEFAULT_DISPATCHER.vkGetImageMemoryRequirements2 ? VULKAN_HPP_DEFAULT_DISPATCHER.vkGetImageMemoryRequirements2 : VULKAN_HPP_DEFAULT_DISPATCHER.vkGetImageMemoryRequirements2KHR;
  vkFuncs.vkBindBufferMemory2KHR = VULKAN_HPP_DEFAULT_DISPATCHER.vkBindBufferMemory2 ? VULKAN_HPP_DEFAULT_DISPATCHER.vkBindBufferMemory2 : VULKAN_HPP_DEFAULT_DISPATCHER.vkBindBufferMemory2KHR;
  vkFuncs.vkBindImageMemory2KHR = VULKAN_HPP_DEFAULT_DISPATCHER.vkBindImageMemory2 ? VULKAN_HPP_DEFAULT_DISPATCHER.vkBindImageMemory2 : VULKAN_HPP_DEFAULT_DISPATCHER.vkBindImageMemory2KHR;

  VmaAllocatorCreateInfo createInfo = {};
  createInfo.vulkanApiVersion=apiVersion;
  createInfo.physicalDevice = physicalDevice;
  createInfo.device = *device;
  createInfo.instance = *instance;
  createInfo.pVulkanFunctions = &vkFuncs;

  allocator = vma::cxx::UniqueAllocator(createInfo);
}

// Selects the physical device to render with and derives vkTimeout and
// msaaSamples from it.
//
// If the "device" setting is a valid index into the enumerated devices, that
// device is used. Otherwise every device is scored and the highest-scoring
// one is chosen; runtimeError() is raised when the best score is 0.
void AsyVkRender::pickPhysicalDevice()
{
  bool remote=false;

  // With a window, a DISPLAY value that does not start with ':' is treated as
  // a remote display.
  if(View) {
    char *display=getenv("DISPLAY");
    remote=display ? string(display).find(":") != 0 : false;
  }

  Int device=getSetting<Int>("device");

  // Count the devices (needed to validate an explicit index) and optionally
  // list them.
  ssize_t count=0;
  bool showDevices=settings::verbose > 1;
  if(device >= 0 || showDevices) {
    for(auto& dev: instance->enumeratePhysicalDevices()) {
      if(showDevices)
        std::cerr << "Device " << count << ": " << dev.getProperties().deviceName << std::endl;
      count++;
    }
  }

  // Remote onscreen rendering is restricted to CPU (software) devices.
  bool software=View && remote;

  // NOTE(review): a non-negative index that is out of range silently falls
  // through to automatic selection below — confirm that is intended.
  if(device >= 0 && device < count) {
    physicalDevice=instance->enumeratePhysicalDevices()[device];
    if(software && physicalDevice.getProperties().deviceType !=
       vk::PhysicalDeviceType::eCpu)
      runtimeError("remote onscreen rendering requires the llvmpipe device");
  } else {
    // Score: 0 for unsuitable devices; otherwise points for the available
    // MSAA level plus points for the device type.
    auto const getDeviceScore =
      [this,software](vk::PhysicalDevice& device) -> size_t
      {
        size_t score = 0u;

        if (!isDeviceSuitable(device))
          return score;

        auto const msaa = getMaxMSAASamples(device).second;

        switch (msaa)
          {
            case vk::SampleCountFlagBits::e64:
            case vk::SampleCountFlagBits::e32:
            case vk::SampleCountFlagBits::e16:

              score += 10;
              break;

            case vk::SampleCountFlagBits::e8:
            case vk::SampleCountFlagBits::e4:
            case vk::SampleCountFlagBits::e2:

              score += 5;
              break;

            default:

              break;
          }

        auto const props = device.getProperties();

        // Hardware GPUs are rejected outright when a software device is
        // required.
        if(vk::PhysicalDeviceType::eDiscreteGpu == props.deviceType) {
          if(software) return 0;
          score += 10;
        } else if(vk::PhysicalDeviceType::eIntegratedGpu == props.deviceType) {
          if(software) return 0;
          score += 5;
        } else if(vk::PhysicalDeviceType::eCpu == props.deviceType &&
                  software) {
          // Force using software renderer for remote onscreen rendering
          score += 100;
        }

        return score;
      };

    std::pair<size_t, vk::PhysicalDevice> highestDeviceScore;

    // The first device is always recorded; later ones replace it only with a
    // strictly higher score.
    for (auto & dev: instance->enumeratePhysicalDevices())
      {
        auto const score = getDeviceScore(dev);

        if (nullptr == highestDeviceScore.second
            || score > highestDeviceScore.first)
          highestDeviceScore = std::make_pair(score, dev);
      }

    if (0 == highestDeviceScore.first)
      runtimeError("no suitable GPUs");

    physicalDevice = highestDeviceScore.second;
  }

  if(settings::verbose > 1)
    cout << "Using device " << physicalDevice.getProperties().deviceName
         << endl;

  // Software renderers (llvmpipe, vulkan-software) JIT-compile shaders to native
  // code on first use, causing cold-start delays of several seconds.  Use a
  // longer Vulkan wait timeout for those devices so the first frame doesn't
  // trigger a spurious VK_TIMEOUT error.  Hardware GPUs get a shorter timeout
  // so that genuine hangs are detected promptly.
  if (physicalDevice.getProperties().deviceType == vk::PhysicalDeviceType::eCpu) {
    vkTimeout = 30'000'000'000ULL; // 30 seconds for software renderers
  } else {
    vkTimeout = 1'000'000'000ULL;  // 1 second for hardware GPUs
  }

  std::uint32_t nSamples;

  // Store the MSAA sample count chosen for the selected device.
  std::tie(nSamples, msaaSamples) = getMaxMSAASamples(physicalDevice);

  if(settings::verbose > 1 && msaaSamples != vk::SampleCountFlagBits::e1)
    cout << "Multisampling enabled with sample width " << nSamples
         << endl;
}

// Returns the largest MSAA sample count (as a number and as a Vulkan flag bit)
// that `gpu` supports for both colour and depth framebuffers and that does not
// exceed the "multisample" setting. Returns a single sample when the "fxaa"
// setting is on or when no larger count qualifies.
std::pair<std::uint32_t, vk::SampleCountFlagBits>
AsyVkRender::getMaxMSAASamples( vk::PhysicalDevice & gpu )
{
  auto const singleSample = std::make_pair(1, vk::SampleCountFlagBits::e1);

  // FXAA means we disable MSAA
  if (settings::getSetting<bool>("fxaa"))
    return singleSample;

  auto const props = gpu.getProperties();
  auto const supported = props.limits.framebufferColorSampleCounts
                         & props.limits.framebufferDepthSampleCounts;
  auto const limit = settings::getSetting<Int>("multisample");

  // Candidates, highest sample count first.
  struct Candidate {
    int samples;
    vk::SampleCountFlagBits bit;
  };
  static constexpr Candidate candidates[] = {
    {64, vk::SampleCountFlagBits::e64},
    {32, vk::SampleCountFlagBits::e32},
    {16, vk::SampleCountFlagBits::e16},
    {8, vk::SampleCountFlagBits::e8},
    {4, vk::SampleCountFlagBits::e4},
    {2, vk::SampleCountFlagBits::e2},
  };

  for (auto const& candidate : candidates) {
    if ((supported & candidate.bit) && limit >= candidate.samples)
      return std::make_pair(candidate.samples, candidate.bit);
  }

  return singleSample;
}

// Scans the queue families of `physicalDevice` and records indices for
// rendering (graphics-capable), presentation and transfer.
// Presentation support is only checked on graphics-capable families, and only
// when `surface` is non-null. The scan does not stop at the first match, so
// the last matching family of each kind is the one recorded.
QueueFamilyIndices AsyVkRender::findQueueFamilies(vk::PhysicalDevice& physicalDevice, vk::SurfaceKHR* surface)
{
  QueueFamilyIndices found;

  auto const families = physicalDevice.getQueueFamilyProperties();

  unsigned familyIndex = 0u;
  for (auto const& family : families)
  {
    bool const hasGraphics = static_cast<bool>(family.queueFlags & vk::QueueFlagBits::eGraphics);
    bool const hasTransfer = static_cast<bool>(family.queueFlags & vk::QueueFlagBits::eTransfer);

    if (hasGraphics) {
      found.renderQueueFamily = familyIndex;
      found.renderQueueFamilyFound = true;

      if (surface != nullptr && VK_FALSE != physicalDevice.getSurfaceSupportKHR(familyIndex, *surface)) {
        found.presentQueueFamily = familyIndex;
        found.presentQueueFamilyFound = true;
      }
    }

    if (hasTransfer) {
      found.transferQueueFamily = familyIndex;
      found.transferQueueFamilyFound = true;
    }

    ++familyIndex;
  }

  return found;
}

// Decides whether `device` can be used by this renderer.
// Requires transfer and render queue families (plus a present family when
// View is set), the required device extensions, at least one surface format
// and present mode when View is set, and sampler anisotropy support.
bool AsyVkRender::isDeviceSuitable(vk::PhysicalDevice& device)
{
  auto const families = findQueueFamilies(device, View ? &*surface : nullptr);

  bool const queuesAvailable = families.transferQueueFamilyFound
                               && families.renderQueueFamilyFound
                               && (families.presentQueueFamilyFound || !View);
  if (!queuesAvailable)
    return false;

  if (!checkDeviceExtensionSupport(device))
    return false;

  auto const features = device.getFeatures();

  // Swap chain support only matters when rendering to a window.
  if (View && !SwapChainDetails(device, *surface))
    return false;

  return features.samplerAnisotropy;
}

// True when `device` reports every extension in deviceExtensions as well as
// the swapchain extension.
bool AsyVkRender::checkDeviceExtensionSupport(vk::PhysicalDevice& device)
{
  // Names the device actually offers.
  std::set<std::string> offered;
  auto reported = device.enumerateDeviceExtensionProperties();
  for (auto& properties : reported)
    offered.insert(properties.extensionName);

  // Names we cannot do without.
  std::set<std::string> needed(deviceExtensions.begin(), deviceExtensions.end());
  needed.insert(VK_KHR_SWAPCHAIN_EXTENSION_NAME);

  for (auto const& name : needed) {
    if (offered.find(name) == offered.end())
      return false;
  }
  return true;
}

// Create the logical device for physicalDevice and fetch its queues.
//
// Steps performed:
//  1. Assemble the device extension list: deviceExtensions, timeline
//     semaphores, synchronization2 and swap chain are always requested;
//     portability subset, fragment shader interlock, memory requirements 2,
//     bind memory 2 and (DEBUG builds) debug marker only when the device
//     reports them.
//  2. Request one queue from each distinct transfer/render/present family.
//  3. Chain the extension feature structures through pNext and create the
//     device.
//  4. Store the transfer, render and (if found) present queue handles.
//
// Side effects: may clear `interlock`, may set `hasDebugMarker`, and assigns
// queueFamilyIndices, device, transferQueue, renderQueue and presentQueue.
void AsyVkRender::createLogicalDevice()
{
  auto const supportedDeviceExtensions = getDeviceExtensions(physicalDevice);
  std::vector<const char*> extensions(deviceExtensions.begin(), deviceExtensions.end());
  bool usePortability = false;

  // Query the timeline semaphore features; the query fills
  // timelineSemaphoreFeatures, which is later chained into the device
  // creation info.  The reported value is not checked here.
  vk::PhysicalDeviceTimelineSemaphoreFeatures timelineSemaphoreFeatures;
  vk::PhysicalDeviceFeatures2 deviceFeatures2;
  deviceFeatures2.pNext = &timelineSemaphoreFeatures;

  physicalDevice.getFeatures2(&deviceFeatures2);

  // These two are requested without consulting supportedDeviceExtensions.
  extensions.push_back(VK_KHR_TIMELINE_SEMAPHORE_EXTENSION_NAME);
  extensions.push_back(VK_KHR_SYNCHRONIZATION_2_EXTENSION_NAME);

  if (supportedDeviceExtensions.find(VK_KHR_PORTABILITY_SUBSET_EXTENSION_NAME) != supportedDeviceExtensions.end()) {
    extensions.push_back(VK_KHR_PORTABILITY_SUBSET_EXTENSION_NAME);
    usePortability = true;
  }
  // The swap-chain extension is requested whether or not View is set.
  extensions.push_back(VK_KHR_SWAPCHAIN_EXTENSION_NAME);
  if (interlock) {
    // Silently turn interlock off when the device lacks the extension.
    if (supportedDeviceExtensions.find(VK_EXT_FRAGMENT_SHADER_INTERLOCK_EXTENSION_NAME) == supportedDeviceExtensions.end()) {
      interlock=false;
    }
    else {
      extensions.emplace_back(VK_EXT_FRAGMENT_SHADER_INTERLOCK_EXTENSION_NAME);
    }
  }

  if (supportedDeviceExtensions.find(VK_KHR_GET_MEMORY_REQUIREMENTS_2_EXTENSION_NAME) != supportedDeviceExtensions.end()) {
    extensions.push_back(VK_KHR_GET_MEMORY_REQUIREMENTS_2_EXTENSION_NAME);
    if (settings::verbose > 1)
      std::cout << "Using logical device memory requirements extension"
                << std::endl;
  }

  if (supportedDeviceExtensions.find(VK_KHR_BIND_MEMORY_2_EXTENSION_NAME) != supportedDeviceExtensions.end()) {
    extensions.push_back(VK_KHR_BIND_MEMORY_2_EXTENSION_NAME);
  }

#if defined(DEBUG)
  // Debug markers are optional: warn and continue when unavailable.
  auto const hasDebugMarkerExt=
    supportedDeviceExtensions.find(VK_EXT_DEBUG_MARKER_EXTENSION_NAME) != supportedDeviceExtensions.end();

  if (hasDebugMarkerExt)
  {
    hasDebugMarker=true;
    extensions.emplace_back(VK_EXT_DEBUG_MARKER_EXTENSION_NAME);
  }
  else
  {
    reportWarning("Debug marker extension not supported");
  }
#endif

  queueFamilyIndices = findQueueFamilies(physicalDevice, View ? &*surface : nullptr);

  // A std::set collapses roles that share a family, so each family gets
  // exactly one DeviceQueueCreateInfo.
  std::vector<vk::DeviceQueueCreateInfo> queueCIs;
  std::set<uint32_t> uniqueQueueFamilies = {
    queueFamilyIndices.transferQueueFamily,
    queueFamilyIndices.renderQueueFamily
  };

  if (queueFamilyIndices.presentQueueFamilyFound) {
    uniqueQueueFamilies.emplace(queueFamilyIndices.presentQueueFamily);
  }

  // One queue per family; every create info points at this same priority.
  float queuePriority = 1.0f;
  for(auto queueFamily : uniqueQueueFamilies) {
    vk::DeviceQueueCreateInfo queueCI(vk::DeviceQueueCreateFlags(), queueFamily, 1, &queuePriority);
    queueCIs.push_back(queueCI);
  }

  // Build the pNext chain for device features.  Each structure added below is
  // pushed onto the front of the chain: its pNext takes the current head and
  // it becomes the new head.
  // The timeline semaphore features (as filled by the query above) are always
  // the tail of the chain.
  void * extensionChain = nullptr;
  timelineSemaphoreFeatures.pNext = extensionChain;
  extensionChain = &timelineSemaphoreFeatures;

  // Positional arguments: constantAlphaColorBlendFactors=false, events=true
  // -- TODO confirm against the Vulkan-Hpp constructor order.
  auto portabilityFeatures = vk::PhysicalDevicePortabilitySubsetFeaturesKHR(
    false,
    true
  );
  // Positional arguments: sample interlock, pixel interlock, shading-rate
  // interlock -- TODO confirm against the Vulkan-Hpp constructor order.
  auto interlockFeatures = vk::PhysicalDeviceFragmentShaderInterlockFeaturesEXT(
    true,
    true,
    false
  );
  // NOTE(review): resolveExtension/props are filled by getProperties2 below
  // but are not read again inside this function.
  auto resolveExtension = vk::PhysicalDeviceDepthStencilResolveProperties(
    vk::ResolveModeFlagBits::eMin,
    vk::ResolveModeFlagBits::eMin
  );
  auto props = vk::PhysicalDeviceProperties2(
    {},
    &resolveExtension
  );

  // Core features enabled on the device.
  vk::PhysicalDeviceFeatures deviceFeatures;
  deviceFeatures.fillModeNonSolid = true;
  // Needed for some Mac machines.
  deviceFeatures.fragmentStoresAndAtomics = true;
//  deviceFeatures.shaderStorageImageWriteWithoutFormat=true;
//  deviceFeatures.shaderStorageImageReadWithoutFormat=true;

  physicalDevice.getProperties2(&props);

  // synchronization2 is always enabled and chained.
  vk::PhysicalDeviceSynchronization2Features synchronization2Features;
  synchronization2Features.sType = vk::StructureType::ePhysicalDeviceSynchronization2Features;
  synchronization2Features.synchronization2 = VK_TRUE;
  synchronization2Features.pNext = extensionChain;
  extensionChain = &synchronization2Features;

  // Optional feature structures are chained only when their extension was
  // added to the extension list above.
  if (usePortability) {
    portabilityFeatures.pNext = extensionChain;
    extensionChain = &portabilityFeatures;
  }

  if (interlock) {
    interlockFeatures.pNext = extensionChain;
    extensionChain = &interlockFeatures;
  }

  // All structures referenced by deviceCI (queueCIs, extensions, the feature
  // chain) are locals that stay alive until the device has been created.
  auto deviceCI = vk::DeviceCreateInfo(
    vk::DeviceCreateFlags(),
    VEC_VIEW(queueCIs),
    VEC_VIEW(validationLayers),
    VEC_VIEW(extensions),
    &deviceFeatures,
    extensionChain
  );

  device = physicalDevice.createDeviceUnique(deviceCI, nullptr);
  // Initialise the default dispatcher with the new device.
  VULKAN_HPP_DEFAULT_DISPATCHER.init(*device);

  // Queue index 0 is the only queue requested from each family.
  transferQueue = device->getQueue(queueFamilyIndices.transferQueueFamily, 0);
  renderQueue = device->getQueue(queueFamilyIndices.renderQueueFamily, 0);
  if (queueFamilyIndices.presentQueueFamilyFound) {
    presentQueue = device->getQueue(queueFamilyIndices.presentQueueFamily, 0);
  }
}

// Create a timeline semaphore whose counter starts at initialValue.
// Returns the owning handle.
vk::UniqueSemaphore AsyVkRender::createTimelineSemaphore(uint64_t initialValue) {
  // The semaphore type is selected through a structure chained via pNext.
  vk::SemaphoreTypeCreateInfo typeInfo;
  typeInfo.semaphoreType = vk::SemaphoreType::eTimeline;
  typeInfo.initialValue = initialValue;

  vk::SemaphoreCreateInfo semaphoreInfo;
  semaphoreInfo.pNext = &typeInfo;

  return device->createSemaphoreUnique(semaphoreInfo);
}

// Wait until `semaphore` reaches `value`.
//
// semaphore: timeline semaphore to wait on.
// value:     counter value to wait for.
// timeout:   upper bound, in nanoseconds, for a single wait attempt.
//
// Up to ten attempts are made.  The first uses a tenth of `timeout` and every
// further one doubles the previous attempt's duration, capped at `timeout`.
// If all attempts time out, a warning is printed, the device is drained with
// waitIdle() and the cached timeline counters are reset to 0.
void AsyVkRender::waitForTimelineSemaphore(vk::Semaphore semaphore, uint64_t value, uint64_t timeout)
{
  vk::SemaphoreWaitInfo waitInfo(
    {},
    1, &semaphore,
    &value
  );

  uint64_t const maxRetries = 10;

  // Integer arithmetic keeps the nanosecond count exact.  For timeouts too
  // small to divide, use the whole timeout rather than a zero-length wait.
  uint64_t retryTimeout = timeout / 10;
  if (retryTimeout == 0)
    retryTimeout = timeout;

  // Every pass through the loop counts as an attempt, so the loop terminates
  // even if runtimeError() returns.
  for (uint64_t retryCount = 0; retryCount < maxRetries; ++retryCount) {
    vk::Result const result = device->waitSemaphores(waitInfo, retryTimeout);

    if (result == vk::Result::eSuccess) {
      return;
    }

    if (result != vk::Result::eTimeout) {
      // Other error - this should be reported
      runtimeError("Timeline semaphore wait failed with result " +
                   std::to_string(static_cast<int>(result)));
      continue;
    }

    // Exponential backoff, capped at `timeout`.  Comparing against timeout/2
    // first avoids unsigned wrap-around when timeout is near UINT64_MAX.
    retryTimeout = retryTimeout > timeout / 2 ? timeout : retryTimeout * 2;

    // Small sleep to avoid busy waiting
    std::this_thread::sleep_for(std::chrono::microseconds(100));
  }

  // If we've exhausted all retries, then force a full device sync
  cerr << "warning: Timeline semaphore wait timed out after " << 1.0e-9*timeout << " seconds" << endl;

  // Force full synchronization
  try {
    device->waitIdle();
    currentTimelineValue = 0;
    // Reset all frame objects' timeline values
    for (auto& frameObj : frameObjects) {
      frameObj.timelineValue = 0;
      frameObj.computeTimelineValue = 0;
    }
  } catch (const std::exception& e) {
    cerr << "Error during device waitIdle after timeout: " << e.what() << endl;
  }
}

// Record a single image memory barrier into `cmd` that moves `image` from
// oldImageLayout to newImageLayout.
//
// The access masks, pipeline stages and subresource range are forwarded
// unchanged; no queue family ownership transfer is requested.
void AsyVkRender::transitionImageLayout(vk::CommandBuffer cmd,
                             vk::Image image,
                             vk::AccessFlags srcAccessMask,
                             vk::AccessFlags dstAccessMask,
                             vk::ImageLayout oldImageLayout,
                             vk::ImageLayout newImageLayout,
                             vk::PipelineStageFlags srcStageMask,
                             vk::PipelineStageFlags dstStageMask,
                             vk::ImageSubresourceRange subresourceRange)
{
  vk::ImageMemoryBarrier layoutBarrier;
  layoutBarrier.srcAccessMask = srcAccessMask;
  layoutBarrier.dstAccessMask = dstAccessMask;
  layoutBarrier.oldLayout = oldImageLayout;
  layoutBarrier.newLayout = newImageLayout;
  layoutBarrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
  layoutBarrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
  layoutBarrier.image = image;
  layoutBarrier.subresourceRange = subresourceRange;

  // No global or buffer barriers, exactly one image barrier.
  cmd.pipelineBarrier(srcStageMask, dstStageMask, { },
                      0, nullptr,
                      0, nullptr,
                      1, &layoutBarrier);
}

void AsyVkRender::createExportResources()
{
  auto const cmdInfo = vk::CommandBufferAllocateInfo(
    *renderCommandPool,
    vk::CommandBufferLevel::ePrimary,
    1
  );

  exportCommandBuffer = std::move(device->allocateCommandBuffersUnique(cmdInfo)[0]);
  exportFence = device->createFenceUnique(vk::FenceCreateInfo(vk::FenceCreateFlagBits::eSignaled));
}

// Create (or recreate) the swap chain for the current surface.
//
// Creates the surface first if it does not exist yet.  Surface format, extent,
// image count and present mode are chosen by SwapChainDetails.  An existing
// swap chain is handed over as oldSwapchain.  Afterwards backbufferImages,
// backbufferImageFormat and backbufferExtent describe the new swap chain, and
// every image has been transitioned to ePresentSrcKHR.
void AsyVkRender::createSwapChain()
{
  if (!surface)
    createSurface();

  auto const swapDetails = SwapChainDetails(physicalDevice, *surface);
  auto && format = swapDetails.chooseSurfaceFormat();
  auto && extent = swapDetails.chooseExtent(Width,Height);

  vk::ImageUsageFlags swapchainImgUsageFlags =
          vk::ImageUsageFlagBits::eColorAttachment
          | vk::ImageUsageFlagBits::eTransferSrc;

  // With FXAA the swap-chain images are also used as a transfer destination.
  if (fxaa)
  {
    swapchainImgUsageFlags |= vk::ImageUsageFlagBits::eTransferDst;
  }

  vk::SwapchainCreateInfoKHR swapchainCI = vk::SwapchainCreateInfoKHR(
    vk::SwapchainCreateFlagsKHR(),
    *surface,
    swapDetails.chooseImageCount(),
    format.format,
    format.colorSpace,
    extent,
    1,
    swapchainImgUsageFlags,
    vk::SharingMode::eExclusive,
    0,
    nullptr,
    swapDetails.capabilities.currentTransform,
    vk::CompositeAlphaFlagBitsKHR::eOpaque,
    swapDetails.choosePresentMode(),
    VK_TRUE,
    nullptr,
    nullptr
  );

  if (*swapChain) {
    swapchainCI.oldSwapchain = *swapChain;
  }

  // Declared at function scope (not static) so that it is rebuilt from the
  // current queueFamilyIndices on every call and still outlives the
  // createSwapchainKHRUnique call that reads it through swapchainCI.
  std::array<std::uint32_t, 2> const sharedQueueFamilies
  {
    queueFamilyIndices.renderQueueFamily,
    queueFamilyIndices.presentQueueFamily
  };

  // Images must be shared when rendering and presenting use different families.
  if (queueFamilyIndices.renderQueueFamily != queueFamilyIndices.presentQueueFamily) {
    swapchainCI.imageSharingMode = vk::SharingMode::eConcurrent;
    swapchainCI.queueFamilyIndexCount = static_cast<std::uint32_t>(sharedQueueFamilies.size());
    swapchainCI.pQueueFamilyIndices= sharedQueueFamilies.data();
  }

  swapChain = device->createSwapchainKHRUnique(swapchainCI, nullptr);
  backbufferImages = device->getSwapchainImagesKHR(*swapChain);
  backbufferImageFormat = format.format;
  backbufferExtent = extent;

  for(auto & image: backbufferImages) {
    transitionImageLayout(vk::ImageLayout::eUndefined, vk::ImageLayout::ePresentSrcKHR, image);
  }
}

void AsyVkRender::createOffscreenBuffers() {
  backbufferExtent=vk::Extent2D(Width, Height);

  auto usageBits=vk::ImageUsageFlagBits::eColorAttachment |
    vk::ImageUsageFlagBits::eTransferSrc;

  if(fxaa)
    usageBits=usageBits | vk::ImageUsageFlagBits::eTransferDst;

  defaultBackbufferImg = createImage(
          backbufferExtent.width,
          backbufferExtent.height,
              vk::SampleCountFlagBits::e1, backbufferImageFormat,
          usageBits,
              VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT);
  backbufferImages.emplace_back(defaultBackbufferImg.getImage());

  for(auto & image: backbufferImages) {
    transitionImageLayout(vk::ImageLayout::eUndefined, vk::ImageLayout::eColorAttachmentOptimal, image);
  }
}

void AsyVkRender::createImageViews()
{
  auto const bufferCount= backbufferImages.size();
  backbufferImageViews.clear();
  backbufferImageViews.reserve(bufferCount);
  for (size_t i= 0; i < bufferCount; ++i)
  {
    vk::ImageViewCreateInfo const viewCI(
            vk::ImageViewCreateFlags(),
            backbufferImages[i],
            vk::ImageViewType::e2D,
            backbufferImageFormat,
            vk::ComponentMapping(),
            vk::ImageSubresourceRange(vk::ImageAspectFlagBits::eColor, 0, 1, 0, 1)
    );
    auto const& imgView= backbufferImageViews.emplace_back(device->createImageViewUnique(viewCI, nullptr));

    setDebugObjectName(*imgView, "backbufferImageView" + std::to_string(i));
  }
}


// Compile a GLSL shader file to SPIR-V and wrap it in a shader module.
//
// lang:     glslang shader stage.
// filename: shader file, resolved through locatefile().
// options:  preprocessor symbols; each becomes a "#define <option>" line in a
//           generated header that starts with "#version 450" and is prepended
//           to the file contents.
//
// On a parse failure the full source handed to the compiler is listed on
// stderr with line numbers before runtimeError() is called; link failures
// also go through runtimeError().
// Returns the owning shader module handle.
vk::UniqueShaderModule AsyVkRender::createShaderModule(EShLanguage lang, std::string const & filename, std::vector<std::string> const & options)
{
  std::string header = "#version 450\n";

  for (auto const & option: options) {
    header += "#define " + option + "\n";
  }
  string filePath = locatefile(string(filename));
  auto fileContents= readFile(filePath.c_str());
  fileContents.emplace_back(0); // terminate string

  // header + file contents; the terminator appended above ends the string.
  std::vector<char> source(header.begin(), header.end());
  source.insert(source.end(), fileContents.begin(), fileContents.end());

  std::vector<const char*> const shaderSources {source.data()};
  auto const res = getShaderResources();
  auto const compileMessages = EShMessages(EShMsgSpvRules | EShMsgVulkanRules);
  auto shader = glslang::TShader(lang);
  glslang::TProgram program;
  std::vector<std::uint32_t> spirv;

  shader.setStrings(shaderSources.data(), shaderSources.size());

  if (!shader.parse(&res, 100, false, compileMessages)) {
    // List exactly what was compiled (generated header included) so that the
    // printed line numbers match the ones in the compiler's log.
    std::stringstream s(source.data());
    std::string line;
    unsigned int k=0;
    while(getline(s,line))
      cerr << ++k << ": " << line << std::endl;
    runtimeError("\n failed to parse "
                             + filename
                             + ":\n" + shader.getInfoLog()
                             + " " + shader.getInfoDebugLog());
  }

  program.addShader(&shader);

  if (!program.link(compileMessages)) {
    // Link diagnostics are stored in the program, not in the shader.
    runtimeError("failed to link shader "
                             + filename
                             + ": " + program.getInfoLog()
                             + " " + program.getInfoDebugLog());
  }

  glslang::GlslangToSpv(*program.getIntermediate(lang), spirv);

  // codeSize is expressed in bytes.
  auto shaderModuleCI = vk::ShaderModuleCreateInfo(
    {},
    spirv.size() * sizeof(std::uint32_t),
    spirv.data()
  );

  return device->createShaderModuleUnique(shaderModuleCI);
}

void AsyVkRender::createFramebuffers()
{
  depthFramebuffers.resize(backbufferImageViews.size());
  opaqueGraphicsFramebuffers.resize(backbufferImageViews.size());
  graphicsFramebuffers.resize(backbufferImageViews.size());

  for (auto i= 0u; i < backbufferImageViews.size(); i++)
  {
    // If we are in FXAA, render to an immediate frame buffer
    // to be processed by the fxaa compute shader,
    // otherwise,
    // render directly into swap chain backbuffer
    // still, we should really be moving to scene graphs.
    // The code will get more complicated as times go on
    // (what about multiple post-processing stages, multiple shaders, shadow maps, etc?)

    vk::ImageView const finalRenderTarget =
            fxaa ? *immRenderTargetViews[i]
                 : *backbufferImageViews[i];

    std::array<vk::ImageView, 3> attachments= {*colorImageView, *depthImageView, finalRenderTarget};

    auto depthFramebufferCI = vk::FramebufferCreateInfo(
      {},
      *countRenderPass,
      0, nullptr, backbufferExtent.width, backbufferExtent.height,
      1
    );
    auto opaqueGraphicsFramebufferCI = vk::FramebufferCreateInfo(
      vk::FramebufferCreateFlags(),
      *opaqueGraphicsRenderPass,
      STD_ARR_VIEW(attachments),
      backbufferExtent.width,
      backbufferExtent.height,
      1
    );
    auto graphicsFramebufferCI = vk::FramebufferCreateInfo(
      vk::FramebufferCreateFlags(),
      *graphicsRenderPass,
      STD_ARR_VIEW(attachments),
      backbufferExtent.width,
      backbufferExtent.height,
      1
    );

    depthFramebuffers[i]= device->createFramebufferUnique(depthFramebufferCI);
    opaqueGraphicsFramebuffers[i]= device->createFramebufferUnique(opaqueGraphicsFramebufferCI);
    graphicsFramebuffers[i]= device->createFramebufferUnique(graphicsFramebufferCI);

    setDebugObjectName(*depthFramebuffers[i], "depthFrameBuffer" + std::to_string(i));
    setDebugObjectName(*opaqueGraphicsFramebuffers[i], "opaqueGraphicsFramebuffers" + std::to_string(i));
    setDebugObjectName(*graphicsFramebuffers[i], "graphicsFramebuffers" + std::to_string(i));
  }
}

void AsyVkRender::createCommandPools()
{
  auto transferPoolCI = vk::CommandPoolCreateInfo(vk::CommandPoolCreateFlagBits::eResetCommandBuffer, queueFamilyIndices.transferQueueFamily);
  transferCommandPool = device->createCommandPoolUnique(transferPoolCI);
  auto renderPoolCI = vk::CommandPoolCreateInfo(vk::CommandPoolCreateFlagBits::eResetCommandBuffer, queueFamilyIndices.renderQueueFamily);
  renderCommandPool = device->createCommandPoolUnique(renderPoolCI);
}

void AsyVkRender::createCommandBuffers()
{
  auto renderAllocInfo = vk::CommandBufferAllocateInfo(*renderCommandPool, vk::CommandBufferLevel::ePrimary, static_cast<uint32_t>(maxFramesInFlight * 4));
  auto transferAllocInfo = vk::CommandBufferAllocateInfo(*transferCommandPool, vk::CommandBufferLevel::ePrimary, static_cast<uint32_t>(maxFramesInFlight));
  auto renderCommands = device->allocateCommandBuffersUnique(renderAllocInfo);
  auto transferCommands = device->allocateCommandBuffersUnique(transferAllocInfo);

  for (int i = 0; i < maxFramesInFlight; i++)
  {
    frameObjects[i].commandBuffer = std::move(renderCommands[4 * i]);
    frameObjects[i].countCommandBuffer = std::move(renderCommands[4 * i + 1]);
    frameObjects[i].computeCommandBuffer = std::move(renderCommands[4 * i + 2]);
    frameObjects[i].partialSumsCommandBuffer = std::move(renderCommands[4 * i + 3]);
    frameObjects[i].copyCountCommandBuffer = std::move(transferCommands[i]);
  }
}

// Allocate a primary command buffer from the render pool and begin recording
// it for one-time submission.
// Returns the (non-owning) command buffer handle; endSingleCommands() submits
// and frees it.
vk::CommandBuffer AsyVkRender::beginSingleCommands()
{
  vk::CommandBufferAllocateInfo allocInfo;
  allocInfo.commandPool = *renderCommandPool;
  allocInfo.level = vk::CommandBufferLevel::ePrimary;
  allocInfo.commandBufferCount = 1;

  vk::CommandBuffer const oneShot = device->allocateCommandBuffers(allocInfo)[0];

  vk::CommandBufferBeginInfo beginInfo;
  beginInfo.flags = vk::CommandBufferUsageFlagBits::eOneTimeSubmit;
  oneShot.begin(beginInfo);

  return oneShot;
}

// Finish recording `cmd`, submit it to the render queue, block until it has
// executed and return the command buffer to the render pool.
//
// cmd: command buffer obtained from beginSingleCommands().
void AsyVkRender::endSingleCommands(vk::CommandBuffer cmd)
{
  vk::UniqueFence done = device->createFenceUnique(vk::FenceCreateInfo());
  if (!done) {
    std::cout << "Fence failed to allocate" << std::endl;
  }

  cmd.end();

  vk::SubmitInfo submission;
  submission.commandBufferCount = 1;
  submission.pCommandBuffers = &cmd;

  // No timeout: wait for as long as the GPU needs.
  auto const waitForever = std::numeric_limits<std::uint64_t>::max();

  vkutils::checkVkResult(renderQueue.submit(1, &submission, *done)); // todo transfer queue
  vkutils::checkVkResult(device->waitForFences(1, &*done, true, waitForever));

  device->freeCommandBuffers(*renderCommandPool, 1, &cmd);
}

void AsyVkRender::createSyncObjects()
{
  renderTimelineSemaphore = createTimelineSemaphore(0);

  for (auto i = 0; i < maxFramesInFlight; i++) {
    frameObjects[i].imageAvailableSemaphore = device->createSemaphoreUnique(vk::SemaphoreCreateInfo());
    frameObjects[i].inCountBufferCopy = device->createSemaphoreUnique(vk::SemaphoreCreateInfo());
    frameObjects[i].transferDoneSemaphore = device->createSemaphoreUnique(vk::SemaphoreCreateInfo());
    frameObjects[i].inFlightFence = device->createFenceUnique(vk::FenceCreateInfo(vk::FenceCreateFlagBits::eSignaled));
    frameObjects[i].inComputeFence = device->createFenceUnique(vk::FenceCreateInfo(vk::FenceCreateFlagBits::eSignaled));
    frameObjects[i].compressionFinishedEvent = device->createEventUnique(vk::EventCreateInfo());
    frameObjects[i].sumFinishedEvent = device->createEventUnique(vk::EventCreateInfo());
    frameObjects[i].startTimedSumsEvent = device->createEventUnique(vk::EventCreateInfo());
    frameObjects[i].timedSumsFinishedEvent = device->createEventUnique(vk::EventCreateInfo());
  }
}

// Block the calling thread until `event` is reported as set.
// There is no timeout: the function only returns once the event is set.
void AsyVkRender::waitForEvent(vk::Event event) {
  while (device->getEventStatus(event) != vk::Result::eEventSet) {
    // Give other threads a chance to run instead of spinning at full speed.
    std::this_thread::yield();
  }
}

// Find the index of the first memory type of physicalDevice that is permitted
// by memRequirements.memoryTypeBits and has all requested property flags.
//
// If no memory type matches, runtimeError() is called and the process exits.
uint32_t AsyVkRender::selectMemory(const vk::MemoryRequirements memRequirements, const vk::MemoryPropertyFlags properties)
{
  auto memProperties = physicalDevice.getMemoryProperties();

  for (uint32_t typeIndex = 0; typeIndex < memProperties.memoryTypeCount; ++typeIndex) {
    // Bit typeIndex of memoryTypeBits tells whether this type is allowed.
    bool const allowed = (memRequirements.memoryTypeBits & (1u << typeIndex)) != 0;
    if (!allowed)
      continue;

    auto const offered = memProperties.memoryTypes[typeIndex].propertyFlags;
    if ((properties & offered) == properties)
      return typeIndex;
  }

  runtimeError("failed to find suitable memory type");
  exit(-1);
}

// Create a buffer together with its memory through the VMA allocator.
//
// usage:       Vulkan buffer usage flags.
// properties:  memory property flags the allocation must have.
// size:        buffer size in bytes.
// vmaFlags:    VMA allocation creation flags.
// memoryUsage: VMA memory usage hint.
// bufferName:  optional label; when non-null and settings::verbose > 2 a
//              message with the name and size is printed.
// Returns the owning buffer wrapper.
vma::cxx::UniqueBuffer AsyVkRender::createBufferUnique(
        vk::BufferUsageFlags const& usage,
        VkMemoryPropertyFlags const& properties,
        vk::DeviceSize const& size,
        VmaAllocationCreateFlags const& vmaFlags,
        VmaMemoryUsage const& memoryUsage,
        const char * bufferName)
{
  vk::BufferCreateInfo bufferInfo;
  bufferInfo.size = size;
  bufferInfo.usage = usage;

  VmaAllocationCreateInfo allocationInfo = {};
  allocationInfo.flags = vmaFlags;
  allocationInfo.usage = memoryUsage;
  allocationInfo.requiredFlags = properties;

  bool const announce = bufferName != nullptr && settings::verbose > 2;
  if (announce) {
    std::cout << "Creating buffer " << bufferName << " of size: " << size << std::endl;
  }

  return allocator.createBuffer(bufferInfo, allocationInfo);
}

// Copy the first `size` bytes of srcBuffer to the start of dstBuffer and wait
// for the copy to finish.
//
// A one-time command buffer from the transfer pool is submitted to the
// transfer queue; the function blocks on a fence for up to vkTimeout.
void AsyVkRender::copyBufferToBuffer(const vk::Buffer& srcBuffer, const vk::Buffer& dstBuffer, const vk::DeviceSize size)
{
  vk::CommandBufferAllocateInfo allocInfo;
  allocInfo.commandPool = *transferCommandPool;
  allocInfo.level = vk::CommandBufferLevel::ePrimary;
  allocInfo.commandBufferCount = 1;
  auto oneShot = std::move(device->allocateCommandBuffersUnique(allocInfo)[0]);

  // Record the copy.
  vk::CommandBufferBeginInfo beginInfo(vk::CommandBufferUsageFlagBits::eOneTimeSubmit);
  oneShot->begin(beginInfo);
  vk::BufferCopy region(0, 0, size);
  oneShot->copyBuffer(srcBuffer, dstBuffer, region);
  oneShot->end();

  // Submit and wait for completion.
  auto done = device->createFenceUnique(vk::FenceCreateInfo());
  vk::SubmitInfo submission;
  submission.commandBufferCount = 1;
  submission.pCommandBuffers = &*oneShot;

  if (transferQueue.submit(1, &submission, *done) != vk::Result::eSuccess)
    runtimeError("failed to submit command buffer");

  vkutils::checkVkResult(device->waitForFences(1, &*done, VK_TRUE, vkTimeout));
}

// Record (without submitting) a copy of the first `size` bytes of srcBuffer to
// the start of dstBuffer into `cmd`.
void AsyVkRender::recordBufferCopy(vk::CommandBuffer cmd, const vk::Buffer& srcBuffer, const vk::Buffer& dstBuffer, const vk::DeviceSize size)
{
  vk::BufferCopy region;
  region.srcOffset = 0;
  region.dstOffset = 0;
  region.size = size;

  cmd.copyBuffer(srcBuffer, dstBuffer, region);
}

// Prepare the frame's transfer command buffer for a new round of recording.
//
// Clears transferHasPendingWork.  If the frame has a transfer command buffer,
// waits for (and resets) the transfer fence when one exists, then resets the
// command buffer and begins recording.
void AsyVkRender::beginTransferRecording(FrameObject & object)
{
  object.transferHasPendingWork = false;

  if (!object.copyCountCommandBuffer)
    return;

  // The previous submission of this command buffer must have completed before
  // it may be reset and recorded again.
  if (object.transferFence) {
    vkutils::checkVkResult(device->waitForFences(1, &*object.transferFence, VK_TRUE, vkTimeout));
    vkutils::checkVkResult(device->resetFences(1, &*object.transferFence));
  }

  object.copyCountCommandBuffer->reset();
  object.copyCountCommandBuffer->begin(vk::CommandBufferBeginInfo());
}

// Finish the frame's transfer command buffer and submit it to `queue`.
//
// Does nothing when the frame has no transfer command buffer.  The
// transfer-done semaphore is signaled only if transferHasPendingWork is set;
// the transfer fence is created on first use and signaled by every submission.
void AsyVkRender::endAndSubmitTransfers(FrameObject & object, vk::Queue queue)
{
  if (!object.copyCountCommandBuffer)
    return;

  object.copyCountCommandBuffer->end();

  vk::SubmitInfo submission;
  submission.commandBufferCount = 1;
  submission.pCommandBuffers = &*object.copyCountCommandBuffer;

  // Signal the semaphore only when copies were actually recorded.  Otherwise
  // a second call within the same frame would signal it twice and leave a
  // stale signal behind for later frames.
  if (object.transferHasPendingWork) {
    submission.signalSemaphoreCount = 1;
    submission.pSignalSemaphores = &*object.transferDoneSemaphore;
  }

  // The fence is created unsignaled and is passed to every submission, even
  // an empty one, so that the wait in beginTransferRecording() on the next
  // frame always has something to complete.
  if (!object.transferFence)
    object.transferFence = device->createFenceUnique(vk::FenceCreateInfo());

  // The fence is reused and may still be signaled from an earlier submission
  // (the original note mentions the GPUcompress path in refreshBuffers), so
  // reset it before handing it to the queue.
  vkutils::checkVkResult(device->resetFences(1, &*object.transferFence));

  vkutils::checkVkResult(queue.submit(1, &submission, *object.transferFence));
}

// Upload `size` bytes from `data` into `buffer` through a caller-supplied
// staging buffer: the bytes are written into the mapped staging memory and
// then copied on the GPU with copyBufferToBuffer().
void AsyVkRender::copyToBuffer(
        const vk::Buffer& buffer,
        const void* data,
        vk::DeviceSize size,
        vma::cxx::UniqueBuffer const& stagingBuffer
        )
{
  // The mapping stays alive until the function returns.
  vma::cxx::MemoryMapperLock const mapping(stagingBuffer);
  auto* const staged = mapping.getCopyPtr();
  memcpy(staged, data, size);

  copyBufferToBuffer(stagingBuffer.getBuffer(), buffer, size);
}

// Attach a human-readable name to a Vulkan object through the debug marker
// extension.
//
// Only has an effect in DEBUG builds, and only when hasDebugMarker is set;
// otherwise it does nothing.
void AsyVkRender::setDebugObjectName(
        uint64_t const& object,
        vk::DebugReportObjectTypeEXT const& objType,
        std::string const& name
        )
{
#if defined(DEBUG)
  if (!hasDebugMarker)
    return;

  vk::DebugMarkerObjectNameInfoEXT nameInfo;
  nameInfo.objectType = objType;
  nameInfo.object = object;
  nameInfo.pObjectName = name.c_str();

  device->debugMarkerSetObjectNameEXT(nameInfo);
#endif
}

// Upload `size` bytes from `data` into `buffer` using a temporary
// host-visible, host-coherent staging buffer of exactly `size` bytes.
void AsyVkRender::copyToBuffer(
        const vk::Buffer& buffer,
        const void* data,
        vk::DeviceSize size
)
{
  VkMemoryPropertyFlags const hostVisibleCoherent =
    VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT;

  // The variable name doubles as the label passed to createBufferUnique().
  auto copyToStageBf = createBufferUnique(
    vk::BufferUsageFlagBits::eTransferSrc,
    hostVisibleCoherent,
    size,
    VMA_ALLOCATION_CREATE_HOST_ACCESS_SEQUENTIAL_WRITE_BIT,
    VMA_MEMORY_USAGE_AUTO,
    VARIABLE_NAME(copyToStageBf));

  // Delegate to the overload that works with an existing staging buffer.
  copyToBuffer(buffer, data, size, copyToStageBf);
}

// Record a command into cmdBuffer that fills the whole of `buffer` with zeros.
void AsyVkRender::zeroBuffer(vk::CommandBuffer const& cmdBuffer,
                             vk::Buffer const& buffer)
{
  vk::DeviceSize const startOffset = 0;
  std::uint32_t const fillValue = 0;

  cmdBuffer.fillBuffer(buffer, startOffset, vk::WholeSize, fillValue);
}

// Create an image together with its memory through the VMA allocator.
//
// w, h, depth: image extent.
// samples:     sample count.
// fmt:         image format.
// usage:       image usage flags.
// props:       memory property flags the allocation must have.
// type:        image dimensionality.
//
// The image has one mip level, one array layer, optimal tiling, exclusive
// sharing and starts in the undefined layout.
// Returns the owning image wrapper.
vma::cxx::UniqueImage AsyVkRender::createImage(
        std::uint32_t w, std::uint32_t h, vk::SampleCountFlagBits samples, vk::Format fmt, vk::ImageUsageFlags usage,
        VkMemoryPropertyFlags props, vk::ImageType type, std::uint32_t depth
)
{
  vk::ImageCreateInfo const imageInfo(
    vk::ImageCreateFlags(),
    type,
    fmt,
    vk::Extent3D(w, h, depth),
    1,                              // mipLevels
    1,                              // arrayLayers
    samples,
    vk::ImageTiling::eOptimal,
    usage,
    vk::SharingMode::eExclusive,
    0,                              // queueFamilyIndexCount
    nullptr,                        // pQueueFamilyIndices
    vk::ImageLayout::eUndefined
  );

  VmaAllocationCreateInfo allocationInfo = {};
  allocationInfo.usage = VMA_MEMORY_USAGE_AUTO;
  allocationInfo.requiredFlags = props;

  return allocator.createImage(imageInfo, allocationInfo);
}

// Create a view of `img` and store it in imgView.
//
// fmt:     view format.
// flags:   image aspect covered by the view.
// img:     image to view.
// imgView: receives the owning view handle.
// type:    view dimensionality.
//
// The view covers mip level 0 and array layer 0 with the default component
// mapping.
void AsyVkRender::createImageView(vk::Format fmt,
                                  vk::ImageAspectFlagBits flags,
                                  vk::Image const& img,
                                  vk::UniqueImageView& imgView,
                                  vk::ImageViewType type)
{
  vk::ImageViewCreateInfo const viewInfo(
    vk::ImageViewCreateFlags(),
    img,
    type,
    fmt,
    vk::ComponentMapping(),
    vk::ImageSubresourceRange(flags, 0, 1, 0, 1)
  );

  imgView = device->createImageViewUnique(viewInfo);
}

// Read `size` bytes from the start of `buffer` back into host memory at
// `data`.
//
// The bytes are copied on the GPU into a temporary host-visible staging buffer
// with a one-shot command buffer (beginSingleCommands/endSingleCommands, which
// waits for completion) and then copied from the mapped staging memory into
// `data`.
void AsyVkRender::copyFromBuffer(const vk::Buffer& buffer, void* data,
                                 vk::DeviceSize size)
{
  // The CPU reads from this buffer, so request random host access:
  // sequential-write allocations may be placed in uncached, write-combined
  // memory, which is very slow to read.
  vma::cxx::UniqueBuffer copyFromStageBf= createBufferUnique(
          vk::BufferUsageFlagBits::eTransferDst,
          VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT,
          size,
          VMA_ALLOCATION_CREATE_HOST_ACCESS_RANDOM_BIT,
          VMA_MEMORY_USAGE_AUTO,
          VARIABLE_NAME(copyFromStageBf)
  );

  auto const cmd = beginSingleCommands();
  auto const cpy = vk::BufferCopy(
    0, 0, size
  );

  cmd.copyBuffer(buffer, copyFromStageBf.getBuffer(), 1, &cpy);

  // Returns only after the copy has executed.
  endSingleCommands(cmd);

  vma::cxx::MemoryMapperLock const mappedMem(copyFromStageBf);
  memcpy(data, mappedMem.getCopyPtr(), size);
}

// Create a sampler and store it in `sampler`.
//
// Configuration: linear min/mag filtering, nearest mipmap mode, repeat
// addressing along U and clamp-to-edge along V and W, no anisotropy, no depth
// comparison, and a LOD range fixed at 0.
void AsyVkRender::createImageSampler(vk::UniqueSampler & sampler)
{
  vk::SamplerCreateInfo samplerInfo;

  // Filtering
  samplerInfo.magFilter = vk::Filter::eLinear;
  samplerInfo.minFilter = vk::Filter::eLinear;
  samplerInfo.mipmapMode = vk::SamplerMipmapMode::eNearest;

  // Addressing
  samplerInfo.addressModeU = vk::SamplerAddressMode::eRepeat;
  samplerInfo.addressModeV = vk::SamplerAddressMode::eClampToEdge;
  samplerInfo.addressModeW = vk::SamplerAddressMode::eClampToEdge;

  // Level of detail
  samplerInfo.mipLodBias = 0.f;
  samplerInfo.minLod = 0.f;
  samplerInfo.maxLod = 0.f;

  // Anisotropy and comparison are switched off.
  samplerInfo.anisotropyEnable = VK_FALSE;
  samplerInfo.maxAnisotropy = 0.f;
  samplerInfo.compareEnable = VK_FALSE;
  samplerInfo.compareOp = vk::CompareOp::eAlways;

  sampler = device->createSamplerUnique(samplerInfo);
}

// Change the layout of the color image `img` from `from` to `to` right away.
//
// A single image barrier (mip level 0, array layer 0) is recorded into a
// one-shot command buffer, which endSingleCommands() submits and waits for.
void AsyVkRender::transitionImageLayout(vk::ImageLayout from,
                                        vk::ImageLayout to, vk::Image img)
{
  auto const oneShot = beginSingleCommands();

  vk::ImageSubresourceRange const firstColorLevel(
    vk::ImageAspectFlagBits::eColor,
    0, 1,   // base mip level, level count
    0, 1    // base array layer, layer count
  );

  vk::ImageMemoryBarrier layoutBarrier(
    vk::AccessFlagBits::eMemoryWrite,
    vk::AccessFlagBits::eMemoryWrite,
    from,
    to,
    VK_QUEUE_FAMILY_IGNORED,
    VK_QUEUE_FAMILY_IGNORED,
    img,
    firstColorLevel
  );

  // No global or buffer barriers, exactly one image barrier.
  oneShot.pipelineBarrier(vk::PipelineStageFlagBits::eTopOfPipe,
                          vk::PipelineStageFlagBits::eTransfer,
                          {},
                          0, nullptr,
                          0, nullptr,
                          1, &layoutBarrier);

  endSingleCommands(oneShot);
}

void AsyVkRender::copyDataToImage(const void *data, vk::DeviceSize size,
                                  vk::Image img, std::uint32_t w,
                                  std::uint32_t h,
                                  vk::Offset3D const & offset)
{
  vma::cxx::UniqueBuffer copyToImageStageBf = createBufferUnique(
          vk::BufferUsageFlagBits::eTransferSrc,
          VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT,
          size,
          VMA_ALLOCATION_CREATE_HOST_ACCESS_SEQUENTIAL_WRITE_BIT,
          VMA_MEMORY_USAGE_AUTO,
          VARIABLE_NAME(copyToImageStageBf)
    );

  {
    vma::cxx::MemoryMapperLock mappedMem(copyToImageStageBf);
    memcpy(mappedMem.getCopyPtr<uint8_t>(), data, size);
  }

  auto const cmd = beginSingleCommands();
  auto cpy = vk::BufferImageCopy(
    0,
    0,
    0
  );

  cpy.imageSubresource.aspectMask = vk::ImageAspectFlagBits::eColor;
  cpy.imageSubresource.mipLevel = 0;
  cpy.imageSubresource.baseArrayLayer = 0;
  cpy.imageSubresource.layerCount = 1;
  cpy.imageOffset = offset;
  cpy.imageExtent = vk::Extent3D {
      w,
      h,
      1
  };

  cmd.copyBufferToImage(copyToImageStageBf.getBuffer(), img, vk::ImageLayout::eTransferDstOptimal, 1, &cpy);

  endSingleCommands(cmd);
}

// Upload vertex or index data into the persistent buffers of `bufpair`.
//
// bufpair:    buffer pair whose vertex (isVertex) or index buffer is updated.
// data:       source bytes, or nullptr to only (re)allocate the buffers.
// size:       number of bytes available at `data`.
// nobjects:   stored in bufpair.nobjects.
// usage:      usage flags for a newly created device buffer.
// properties: memory properties for a newly created device buffer.
// isVertex:   selects the vertex (true) or index (false) members.
//
// The device buffer is recreated only when it is missing or too small; the
// staging buffer grows to the next power of two that fits.  The GPU copy is
// recorded into the current frame's transfer command buffer, or performed
// synchronously when the frame has none.
void AsyVkRender::uploadPersistentBuffer(FrameBufferPair& bufpair, const void* data,
                                          vk::DeviceSize size, size_t nobjects,
                                          vk::BufferUsageFlags usage,
                                          VkMemoryPropertyFlagBits properties,
                                          bool isVertex)
{
  // Number of bytes the caller really provided.  `size` is padded below, and
  // reading the padded amount from `data` would run past the caller's buffer.
  vk::DeviceSize const dataSize = size;

  // Vulkan doesn't allow a buffer to have a size of 0
  size = std::max(vk::DeviceSize(16), size);

  bufpair.nobjects = nobjects;

  auto& dstBuffer = isVertex ? bufpair.vertexBuffer : bufpair.indexBuffer;
  auto& dstSize = isVertex ? bufpair.vertexBufferSize : bufpair.indexBufferSize;
  auto& stgBuffer = isVertex ? bufpair.vertexStagingBuffer : bufpair.indexStagingBuffer;
  auto& stgSize = isVertex ? bufpair.vertexStgSize : bufpair.indexStgSize;

  // Reuse existing persistent device buffer if it has sufficient size
  if (dstBuffer.getBuffer() == VK_NULL_HANDLE || dstSize < size) {
    dstBuffer = createBufferUnique(usage, properties, size);
    dstSize = size;
  }

  // Grow staging buffer if needed
  if (size > stgSize) {
    vk::DeviceSize newSize = 16;
    while (newSize < size) newSize *= 2;
    stgSize = newSize;

    stgBuffer = createBufferUnique(
      vk::BufferUsageFlagBits::eTransferSrc,
      VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT,
      stgSize,
      VMA_ALLOCATION_CREATE_HOST_ACCESS_SEQUENTIAL_WRITE_BIT
      );
  }

  if (data) {
    // Copy data into the staging buffer on the CPU side
    if (dataSize > 0) {
      vma::cxx::MemoryMapperLock const stgBufMemPtr(stgBuffer);
      memcpy(stgBufMemPtr.getCopyPtr(), data, dataSize);
    }
    // The GPU copy still uses the padded size so that it is never empty; both
    // buffers are at least that large.
    auto& frame = frameObjects[currentFrame];
    if (!frame.copyCountCommandBuffer) {
      // Fallback: use synchronous copy if no transfer command buffer
      // available.  The staging buffer has already been filled above.
      copyBufferToBuffer(stgBuffer.getBuffer(), dstBuffer.getBuffer(), size);
    } else {
      // Record the GPU copy command into the frame's transfer command buffer
      recordBufferCopy(*frame.copyCountCommandBuffer,
                       stgBuffer.getBuffer(),
                       dstBuffer.getBuffer(),
                       size);
      frame.transferHasPendingWork = true;
    }
  }
}

void AsyVkRender::createDescriptorSetLayout()
{
  auto uboLayoutBinding = vk::DescriptorSetLayoutBinding(
    0,
    vk::DescriptorType::eUniformBuffer,
    1,
    vk::ShaderStageFlagBits::eVertex | vk::ShaderStageFlagBits::eFragment
  );
  auto materialBufferBinding = vk::DescriptorSetLayoutBinding(
    1,
    vk::DescriptorType::eStorageBuffer,
    1,
    vk::ShaderStageFlagBits::eVertex | vk::ShaderStageFlagBits::eFragment
  );
  auto lightBufferBinding = vk::DescriptorSetLayoutBinding(
    2,
    vk::DescriptorType::eStorageBuffer,
    1,
    vk::ShaderStageFlagBits::eVertex | vk::ShaderStageFlagBits::eFragment
  );
  auto countBufferBinding = vk::DescriptorSetLayoutBinding(
    3,
    vk::DescriptorType::eStorageBuffer,
    1,
    vk::ShaderStageFlagBits::eFragment
  );
  auto offsetBufferBinding = vk::DescriptorSetLayoutBinding(
    4,
    vk::DescriptorType::eStorageBuffer,
    1,
    vk::ShaderStageFlagBits::eFragment
  );
  auto fragmentBufferBinding = vk::DescriptorSetLayoutBinding(
    5,
    vk::DescriptorType::eStorageBuffer,
    1,
    vk::ShaderStageFlagBits::eFragment
  );
  auto depthBufferBinding = vk::DescriptorSetLayoutBinding(
    6,
    vk::DescriptorType::eStorageBuffer,
    1,
    vk::ShaderStageFlagBits::eFragment
  );
  auto opaqueBufferBinding = vk::DescriptorSetLayoutBinding(
    7,
    vk::DescriptorType::eStorageBuffer,
    1,
    vk::ShaderStageFlagBits::eFragment
  );
  auto opaqueDepthBufferBinding = vk::DescriptorSetLayoutBinding(
    8,
    vk::DescriptorType::eStorageBuffer,
    1,
    vk::ShaderStageFlagBits::eFragment
  );
  auto indexBufferBinding = vk::DescriptorSetLayoutBinding(
    9,
    vk::DescriptorType::eStorageBuffer,
    1,
    vk::ShaderStageFlagBits::eFragment
  );
  auto elementBufferBinding = vk::DescriptorSetLayoutBinding(
    10,
    vk::DescriptorType::eStorageBuffer,
    1,
    vk::ShaderStageFlagBits::eFragment
  );
  auto irradianceSamplerBinding = vk::DescriptorSetLayoutBinding(
    11,
    vk::DescriptorType::eCombinedImageSampler,
    1,
    vk::ShaderStageFlagBits::eFragment
  );
  auto brdfSamplerBinding = vk::DescriptorSetLayoutBinding(
    12,
    vk::DescriptorType::eCombinedImageSampler,
    1,
    vk::ShaderStageFlagBits::eFragment
  );
  auto reflectionSamplerBinding = vk::DescriptorSetLayoutBinding(
    13,
    vk::DescriptorType::eCombinedImageSampler,
    1,
    vk::ShaderStageFlagBits::eFragment
  );

  std::vector<vk::DescriptorSetLayoutBinding> layoutBindings {
    uboLayoutBinding,
    materialBufferBinding,
    lightBufferBinding,
    countBufferBinding,
    offsetBufferBinding,
    fragmentBufferBinding,
    depthBufferBinding,
    opaqueBufferBinding,
    opaqueDepthBufferBinding,
    indexBufferBinding,
    elementBufferBinding
  };

  if (ibl) {
    layoutBindings.emplace_back(irradianceSamplerBinding);
    layoutBindings.emplace_back(brdfSamplerBinding);
    layoutBindings.emplace_back(reflectionSamplerBinding);
  }

  auto layoutCI = vk::DescriptorSetLayoutCreateInfo(
    vk::DescriptorSetLayoutCreateFlags(),
    layoutBindings.size(),
    &layoutBindings[0]
  );
  materialDescriptorSetLayout = device->createDescriptorSetLayoutUnique(layoutCI);
}

void AsyVkRender::createComputeDescriptorSetLayout()
{
  std::vector<vk::DescriptorSetLayoutBinding> layoutBindings
  {
    vk::DescriptorSetLayoutBinding(0, vk::DescriptorType::eStorageBuffer, 1, vk::ShaderStageFlagBits::eCompute),
    vk::DescriptorSetLayoutBinding(1, vk::DescriptorType::eStorageBuffer, 1, vk::ShaderStageFlagBits::eCompute),
    vk::DescriptorSetLayoutBinding(2, vk::DescriptorType::eStorageBuffer, 1, vk::ShaderStageFlagBits::eCompute),
    vk::DescriptorSetLayoutBinding(3, vk::DescriptorType::eStorageBuffer, 1, vk::ShaderStageFlagBits::eCompute)
  };
  auto layoutCI = vk::DescriptorSetLayoutCreateInfo(
    vk::DescriptorSetLayoutCreateFlags(),
    layoutBindings.size(),
    &layoutBindings[0]
  );

  computeDescriptorSetLayout = device->createDescriptorSetLayoutUnique(layoutCI);

  if (fxaa)
  {
    std::vector<vk::DescriptorSetLayoutBinding> const postProcessingLayoutBindings{
            {0, vk::DescriptorType::eCombinedImageSampler, 1, vk::ShaderStageFlagBits::eCompute},
            {1, vk::DescriptorType::eStorageImage, 1, vk::ShaderStageFlagBits::eCompute},
            {2, vk::DescriptorType::eStorageImage, 1, vk::ShaderStageFlagBits::eCompute},
    };

    postProcessDescSetLayout= device->createDescriptorSetLayoutUnique({{}, VEC_VIEW(postProcessingLayoutBindings)});
  }
}

void AsyVkRender::createDescriptorPool()
{
  std::vector<vk::DescriptorPoolSize> poolSizes;

  poolSizes.resize(11);
  poolSizes[0].type = vk::DescriptorType::eUniformBuffer;
  poolSizes[0].descriptorCount = maxFramesInFlight;

  poolSizes[1].type = vk::DescriptorType::eStorageBuffer;
  poolSizes[1].descriptorCount = maxFramesInFlight;

  poolSizes[2].type = vk::DescriptorType::eStorageBuffer;
  poolSizes[2].descriptorCount = maxFramesInFlight;

  poolSizes[3].type = vk::DescriptorType::eStorageBuffer;
  poolSizes[3].descriptorCount = maxFramesInFlight;

  poolSizes[4].type = vk::DescriptorType::eStorageBuffer;
  poolSizes[4].descriptorCount = maxFramesInFlight;

  poolSizes[5].type = vk::DescriptorType::eStorageBuffer;
  poolSizes[5].descriptorCount = maxFramesInFlight;

  poolSizes[6].type = vk::DescriptorType::eStorageBuffer;
  poolSizes[6].descriptorCount = maxFramesInFlight;

  poolSizes[7].type = vk::DescriptorType::eStorageBuffer;
  poolSizes[7].descriptorCount = maxFramesInFlight;

  poolSizes[8].type = vk::DescriptorType::eStorageBuffer;
  poolSizes[8].descriptorCount = maxFramesInFlight;

  poolSizes[9].type = vk::DescriptorType::eStorageBuffer;
  poolSizes[9].descriptorCount = maxFramesInFlight;

  poolSizes[10].type = vk::DescriptorType::eStorageBuffer;
  poolSizes[10].descriptorCount = maxFramesInFlight;

  if (ibl) {
    poolSizes.emplace_back(
      vk::DescriptorPoolSize(
        vk::DescriptorType::eCombinedImageSampler,
        maxFramesInFlight
      )
    );
    poolSizes.emplace_back(
      vk::DescriptorPoolSize(
        vk::DescriptorType::eCombinedImageSampler,
        maxFramesInFlight
      )
    );
    poolSizes.emplace_back(
      vk::DescriptorPoolSize(
        vk::DescriptorType::eCombinedImageSampler,
        maxFramesInFlight
      )
    );
  }

  auto poolCI = vk::DescriptorPoolCreateInfo(
    vk::DescriptorPoolCreateFlagBits::eFreeDescriptorSet,
    maxFramesInFlight,
    poolSizes.size(),
    &poolSizes[0]
  );
  descriptorPool = device->createDescriptorPoolUnique(poolCI);
}

void AsyVkRender::createComputeDescriptorPool()
{
  std::array<vk::DescriptorPoolSize, 4> poolSizes;

  // countBuffer
  poolSizes[0].type = vk::DescriptorType::eStorageBuffer;
  poolSizes[0].descriptorCount = 1;

  // globalSumBuffer
  poolSizes[1].type = vk::DescriptorType::eStorageBuffer;
  poolSizes[1].descriptorCount = 1;

  // offsetBuffer
  poolSizes[2].type = vk::DescriptorType::eStorageBuffer;
  poolSizes[2].descriptorCount = 1;

  // feedbackBuffer
  poolSizes[3].type = vk::DescriptorType::eStorageBuffer;
  poolSizes[3].descriptorCount = 1;

  auto poolCI = vk::DescriptorPoolCreateInfo(
    vk::DescriptorPoolCreateFlagBits::eFreeDescriptorSet,
    1,
    poolSizes.size(),
    &poolSizes[0]
  );
  computeDescriptorPool = device->createDescriptorPoolUnique(poolCI);

  if (fxaa)
  {
    auto const poolSetCount= static_cast<uint32_t>(backbufferImages.size());

    std::vector<vk::DescriptorPoolSize> const postProcPoolSizes{
            {vk::DescriptorType::eCombinedImageSampler, poolSetCount},// input image
            {vk::DescriptorType::eStorageImage, poolSetCount},        // input image, non-sampled
            {vk::DescriptorType::eStorageImage, poolSetCount},        // output image image
    };

    postProcessDescPool= device->createDescriptorPoolUnique(
            {vk::DescriptorPoolCreateFlagBits::eFreeDescriptorSet, poolSetCount, VEC_VIEW(postProcPoolSizes)}
    );
  }
}

void AsyVkRender::createDescriptorSets()
{
  std::vector<vk::DescriptorSetLayout> layouts(maxFramesInFlight, *materialDescriptorSetLayout);
  auto allocInfo = vk::DescriptorSetAllocateInfo(
    *descriptorPool,
    VEC_VIEW(layouts)
  );
  auto descriptorSets = device->allocateDescriptorSetsUnique(allocInfo);

  for (auto i = 0; i < maxFramesInFlight; i++)
    frameObjects[i].descriptorSet = std::move(descriptorSets[i]);

  auto computeAllocInfo = vk::DescriptorSetAllocateInfo(
    *computeDescriptorPool,
    1,
    &*computeDescriptorSetLayout
  );

  computeDescriptorSet = std::move(device->allocateDescriptorSetsUnique(computeAllocInfo)[0]);

  // post processing descs

  if (fxaa)
  {
    std::vector postProcessDescLayouts(backbufferImages.size(), *postProcessDescSetLayout);
    postProcessDescSet= device->allocateDescriptorSetsUnique({*postProcessDescPool, VEC_VIEW(postProcessDescLayouts)});
  }
}

void AsyVkRender::writeDescriptorSets()
{
  for (auto i = 0; i < maxFramesInFlight; i++) {
    auto uboInfo = vk::DescriptorBufferInfo();

    uboInfo.buffer = frameObjects[i].uboBf.getBuffer();
    uboInfo.offset = 0;
    uboInfo.range = sizeof(UniformBufferObject);

    std::array<vk::WriteDescriptorSet, 7> writes;

    writes[0].dstSet = *frameObjects[i].descriptorSet;
    writes[0].dstBinding = 0;
    writes[0].dstArrayElement = 0;
    writes[0].descriptorType = vk::DescriptorType::eUniformBuffer;
    writes[0].descriptorCount = 1;
    writes[0].pBufferInfo = &uboInfo;

    if(!Opaque) {
      auto countBufferInfo = vk::DescriptorBufferInfo();

      countBufferInfo.buffer = countBf.getBuffer();
      countBufferInfo.offset = 0;
      countBufferInfo.range = countBufferSize;

      auto offsetBufferInfo = vk::DescriptorBufferInfo();

      offsetBufferInfo.buffer = offsetBf.getBuffer();
      offsetBufferInfo.offset = 0;
      offsetBufferInfo.range = offsetBufferSize;

      auto opaqueBufferInfo = vk::DescriptorBufferInfo();

      opaqueBufferInfo.buffer = opaqueBf.getBuffer();
      opaqueBufferInfo.offset = 0;
      opaqueBufferInfo.range = opaqueBufferSize;

      auto opaqueDepthBufferInfo = vk::DescriptorBufferInfo();

      opaqueDepthBufferInfo.buffer = opaqueDepthBf.getBuffer();
      opaqueDepthBufferInfo.offset = 0;
      opaqueDepthBufferInfo.range = opaqueDepthBufferSize;

      auto indexBufferInfo = vk::DescriptorBufferInfo();

      indexBufferInfo.buffer = indexBf.getBuffer();
      indexBufferInfo.offset = 0;
      indexBufferInfo.range = indexBufferSize;

      auto elementBufferInfo = vk::DescriptorBufferInfo();

      elementBufferInfo.buffer = elementBf.getBuffer();
      elementBufferInfo.offset = 0;
      elementBufferInfo.range = elementBufferSize;

      writes[1].dstSet = *frameObjects[i].descriptorSet;
      writes[1].dstBinding = 3;
      writes[1].dstArrayElement = 0;
      writes[1].descriptorType = vk::DescriptorType::eStorageBuffer;
      writes[1].descriptorCount = 1;
      writes[1].pBufferInfo = &countBufferInfo;

      writes[2].dstSet = *frameObjects[i].descriptorSet;
      writes[2].dstBinding = 4;
      writes[2].dstArrayElement = 0;
      writes[2].descriptorType = vk::DescriptorType::eStorageBuffer;
      writes[2].descriptorCount = 1;
      writes[2].pBufferInfo = &offsetBufferInfo;

      writes[3].dstSet = *frameObjects[i].descriptorSet;
      writes[3].dstBinding = 7;
      writes[3].dstArrayElement = 0;
      writes[3].descriptorType = vk::DescriptorType::eStorageBuffer;
      writes[3].descriptorCount = 1;
      writes[3].pBufferInfo = &opaqueBufferInfo;

      writes[4].dstSet = *frameObjects[i].descriptorSet;
      writes[4].dstBinding = 8;
      writes[4].dstArrayElement = 0;
      writes[4].descriptorType = vk::DescriptorType::eStorageBuffer;
      writes[4].descriptorCount = 1;
      writes[4].pBufferInfo = &opaqueDepthBufferInfo;

      if(GPUcompress) {
        writes[5].dstSet = *frameObjects[i].descriptorSet;
        writes[5].dstBinding = 9;
        writes[5].dstArrayElement = 0;
        writes[5].descriptorType = vk::DescriptorType::eStorageBuffer;
        writes[5].descriptorCount = 1;
        writes[5].pBufferInfo = &indexBufferInfo;

        writes[6].dstSet = *frameObjects[i].descriptorSet;
        writes[6].dstBinding = 10;
        writes[6].dstArrayElement = 0;
        writes[6].descriptorType = vk::DescriptorType::eStorageBuffer;
        writes[6].descriptorCount = 1;
        writes[6].pBufferInfo = &elementBufferInfo;
      }
    }

    device->updateDescriptorSets(Opaque ? 1 : (GPUcompress ? 7 : 5),
                                 writes.data(), 0, nullptr);
  }

  if(!Opaque) {
    // compute descriptors

    auto countBufferInfo = vk::DescriptorBufferInfo();

    countBufferInfo.buffer = countBf.getBuffer();
    countBufferInfo.offset = 0;
    countBufferInfo.range = countBufferSize;

    auto globalSumBufferInfo = vk::DescriptorBufferInfo();

    globalSumBufferInfo.buffer = globalSumBf.getBuffer();
    globalSumBufferInfo.offset = 0;
    globalSumBufferInfo.range = globalSize;

    auto offsetBufferInfo = vk::DescriptorBufferInfo();

    offsetBufferInfo.buffer = offsetBf.getBuffer();
    offsetBufferInfo.offset = 0;
    offsetBufferInfo.range = offsetBufferSize;

    auto feedbackBufferInfo = vk::DescriptorBufferInfo();

    feedbackBufferInfo.buffer = feedbackBf.getBuffer();
    feedbackBufferInfo.offset = 0;
    feedbackBufferInfo.range = feedbackBufferSize;

    std::array<vk::WriteDescriptorSet, 4> writes;

    writes[0].dstSet = *computeDescriptorSet;
    writes[0].dstBinding = 0;
    writes[0].dstArrayElement = 0;
    writes[0].descriptorType = vk::DescriptorType::eStorageBuffer;
    writes[0].descriptorCount = 1;
    writes[0].pBufferInfo = &countBufferInfo;

    writes[1].dstSet = *computeDescriptorSet;
    writes[1].dstBinding = 1;
    writes[1].dstArrayElement = 0;
    writes[1].descriptorType = vk::DescriptorType::eStorageBuffer;
    writes[1].descriptorCount = 1;
    writes[1].pBufferInfo = &globalSumBufferInfo;

    writes[2].dstSet = *computeDescriptorSet;
    writes[2].dstBinding = 2;
    writes[2].dstArrayElement = 0;
    writes[2].descriptorType = vk::DescriptorType::eStorageBuffer;
    writes[2].descriptorCount = 1;
    writes[2].pBufferInfo = &offsetBufferInfo;

    writes[3].dstSet = *computeDescriptorSet;
    writes[3].dstBinding = 3;
    writes[3].dstArrayElement = 0;
    writes[3].descriptorType = vk::DescriptorType::eStorageBuffer;
    writes[3].descriptorCount = 1;
    writes[3].pBufferInfo = &feedbackBufferInfo;

    device->updateDescriptorSets(writes.size(), writes.data(), 0, nullptr);
  }

  if (ibl) {
    for (auto i = 0; i < maxFramesInFlight; i++) {
      auto irradianceSampInfo = vk::DescriptorImageInfo();

      irradianceSampInfo.imageLayout = vk::ImageLayout::eShaderReadOnlyOptimal;
      irradianceSampInfo.imageView = *irradianceView;
      irradianceSampInfo.sampler = *irradianceSampler;

      auto brdfSampInfo = vk::DescriptorImageInfo();

      brdfSampInfo.imageLayout = vk::ImageLayout::eShaderReadOnlyOptimal;
      brdfSampInfo.imageView = *brdfView;
      brdfSampInfo.sampler = *brdfSampler;

      auto reflSampInfo = vk::DescriptorImageInfo();

      reflSampInfo.imageLayout = vk::ImageLayout::eShaderReadOnlyOptimal;
      reflSampInfo.imageView = *reflectionView;
      reflSampInfo.sampler = *reflectionSampler;

      std::array<vk::WriteDescriptorSet, 3> samplerWrites;

      samplerWrites[0].dstSet = *frameObjects[i].descriptorSet;
      samplerWrites[0].dstBinding = 11;
      samplerWrites[0].dstArrayElement = 0;
      samplerWrites[0].descriptorType = vk::DescriptorType::eCombinedImageSampler;
      samplerWrites[0].descriptorCount = 1;
      samplerWrites[0].pImageInfo = &irradianceSampInfo;

      samplerWrites[1].dstSet = *frameObjects[i].descriptorSet;
      samplerWrites[1].dstBinding = 12;
      samplerWrites[1].dstArrayElement = 0;
      samplerWrites[1].descriptorType = vk::DescriptorType::eCombinedImageSampler;
      samplerWrites[1].descriptorCount = 1;
      samplerWrites[1].pImageInfo = &brdfSampInfo;

      samplerWrites[2].dstSet = *frameObjects[i].descriptorSet;
      samplerWrites[2].dstBinding = 13;
      samplerWrites[2].dstArrayElement = 0;
      samplerWrites[2].descriptorType = vk::DescriptorType::eCombinedImageSampler;
      samplerWrites[2].descriptorCount = 1;
      samplerWrites[2].pImageInfo = &reflSampInfo;

      device->updateDescriptorSets(samplerWrites.size(), samplerWrites.data(), 0, nullptr);
    }
  }

  if (fxaa)
    writePostProcessDescSets();
}

void AsyVkRender::writePostProcessDescSets()
{
  // Ensure we have valid image views before writing descriptor sets
  if (immRenderTargetViews.empty() || prePresentationImgViews.empty() || immRenderTargetSampler.empty()) {
    runtimeError("Attempting to write post-process descriptor sets with empty image views");
  }

  // post process descriptors
  for (size_t i=0; i < backbufferImages.size(); ++i)
  {
    vk::DescriptorImageInfo inputImgInfo(
            *immRenderTargetSampler[i],
            *immRenderTargetViews[i],
            vk::ImageLayout::eGeneral
            );
    vk::DescriptorImageInfo inputImgInfoNonSampled(
            {},
            *immRenderTargetViews[i],
            vk::ImageLayout::eGeneral
    );
    vk::DescriptorImageInfo outputImgInfo({}, *prePresentationImgViews[i], vk::ImageLayout::eGeneral);


    // Ensure we have valid descriptor sets before writing
    if (i >= postProcessDescSet.size() || !postProcessDescSet[i])
      runtimeError("Invalid post-process descriptor set");

    std::vector<vk::WriteDescriptorSet> const postProcDescWrite{
            {*postProcessDescSet[i], 0, 0, 1, vk::DescriptorType::eCombinedImageSampler, &inputImgInfo},
            {*postProcessDescSet[i], 1, 0, 1, vk::DescriptorType::eStorageImage, &inputImgInfoNonSampled},
            {*postProcessDescSet[i], 2, 0, 1, vk::DescriptorType::eStorageImage, &outputImgInfo}
    };

    device->updateDescriptorSets(VEC_VIEW(postProcDescWrite), EMPTY_VIEW);
  }
}

void AsyVkRender::writeMaterialAndLightDescriptors() {
  for (auto i = 0; i < maxFramesInFlight; i++) {
    auto materialBufferInfo = vk::DescriptorBufferInfo();

    materialBufferInfo.buffer = materialBf.getBuffer();
    materialBufferInfo.offset = 0;
    materialBufferInfo.range = sizeof(camp::Material) * nmaterials;

    auto lightBufferInfo = vk::DescriptorBufferInfo();

    lightBufferInfo.buffer = lightBf.getBuffer();
    lightBufferInfo.offset = 0;
    lightBufferInfo.range = sizeof(Light) * nlights;

    std::array<vk::WriteDescriptorSet, 2> writes;

    writes[0].dstSet = *frameObjects[i].descriptorSet;
    writes[0].dstBinding = 1;
    writes[0].dstArrayElement = 0;
    writes[0].descriptorType = vk::DescriptorType::eStorageBuffer;
    writes[0].descriptorCount = 1;
    writes[0].pBufferInfo = &materialBufferInfo;

    writes[1].dstSet = *frameObjects[i].descriptorSet;
    writes[1].dstBinding = 2;
    writes[1].dstArrayElement = 0;
    writes[1].descriptorType = vk::DescriptorType::eStorageBuffer;
    writes[1].descriptorCount = 1;
    writes[1].pBufferInfo = &lightBufferInfo;

    device->updateDescriptorSets(writes.size(), writes.data(), 0, nullptr);
  }
}

void AsyVkRender::updateSceneDependentBuffers()
{
  fragmentBufferSize = maxFragments*sizeof(vec4);
  fragmentBf = createBufferUnique(
    vk::BufferUsageFlagBits::eStorageBuffer,
    VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT,
    fragmentBufferSize,
    0,
    VMA_MEMORY_USAGE_AUTO,
    VARIABLE_NAME(fragmentBf));

  depthBufferSize = maxFragments*sizeof(float);
  depthBf = createBufferUnique(
          vk::BufferUsageFlagBits::eStorageBuffer,
          VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT,
          depthBufferSize,
          0,
          VMA_MEMORY_USAGE_AUTO,
          VARIABLE_NAME(depthBf));

  // Create a vector to batch all descriptor writes
  std::vector<vk::WriteDescriptorSet> batchedWrites;
  batchedWrites.reserve(maxFramesInFlight * 2); // Pre-allocate space

  // Prepare buffer infos after creating the buffers
  auto fragmentBufferInfo = vk::DescriptorBufferInfo(
    fragmentBf.getBuffer(), 0, fragmentBufferSize);
  auto depthBufferInfo = vk::DescriptorBufferInfo(
    depthBf.getBuffer(), 0, depthBufferSize);

  for(auto i = 0; i < maxFramesInFlight; i++) {
    // Create and add fragment buffer write
    vk::WriteDescriptorSet fragmentWrite(
      *frameObjects[i].descriptorSet,
      5,
      0,
      1,
      vk::DescriptorType::eStorageBuffer,
      nullptr,
      &fragmentBufferInfo,
      nullptr
    );
    batchedWrites.push_back(fragmentWrite);

    // Create and add depth buffer write - this is the one that was missing
    vk::WriteDescriptorSet depthWrite(
      *frameObjects[i].descriptorSet,
      6,
      0,
      1,
      vk::DescriptorType::eStorageBuffer,
      nullptr,
      &depthBufferInfo,
      nullptr
    );
    batchedWrites.push_back(depthWrite);
  }

  // Update all descriptor sets in a single call
  if (!batchedWrites.empty()) {
    device->updateDescriptorSets(batchedWrites.size(), batchedWrites.data(), 0, nullptr);
  }
}

void AsyVkRender::createBuffers()
{
  feedbackBufferSize=2*sizeof(std::uint32_t);
  elementBufferSize=sizeof(std::uint32_t);

  feedbackBf = createBufferUnique(
    vk::BufferUsageFlagBits::eStorageBuffer | vk::BufferUsageFlagBits::eTransferSrc,
    VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_CACHED_BIT,
    feedbackBufferSize,
    VMA_ALLOCATION_CREATE_HOST_ACCESS_RANDOM_BIT,
    VMA_MEMORY_USAGE_AUTO,
    VARIABLE_NAME(feedbackBf)
    );
  feedbackMappedPtr=make_unique<vma::cxx::MemoryMapperLock>(feedbackBf);

  if(GPUcompress)
  {
    elementBf= createBufferUnique(
            vk::BufferUsageFlagBits::eStorageBuffer | vk::BufferUsageFlagBits::eTransferSrc,
            VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_CACHED_BIT,
            elementBufferSize,
            VMA_ALLOCATION_CREATE_HOST_ACCESS_RANDOM_BIT,
    VMA_MEMORY_USAGE_AUTO,
      VARIABLE_NAME(elementBf)
    );
    elemBfMappedMem=make_unique<vma::cxx::MemoryMapperLock>(elementBf);
  }

  for (auto& frameObj : frameObjects) {
    frameObj.uboBf = createBufferUnique(
      vk::BufferUsageFlagBits::eUniformBuffer,
      VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT,
      sizeof(UniformBufferObject),
      VMA_ALLOCATION_CREATE_HOST_ACCESS_SEQUENTIAL_WRITE_BIT,
      VMA_MEMORY_USAGE_AUTO,
      VARIABLE_NAME(frameObj.uboBf)
    );
    frameObj.uboMappedMemory = make_unique<vma::cxx::MemoryMapperLock>(frameObj.uboBf);
  }

  createMaterialAndLightBuffers();
  createDependentBuffers();
}


/**
 * Creates the device-local storage buffers holding the material and light
 * tables.  A buffer is only (re)created when its element count is positive.
 */
void AsyVkRender::createMaterialAndLightBuffers() {
  // Both buffers are filled by transfer and read as storage buffers.
  auto const tableUsage =
    vk::BufferUsageFlagBits::eStorageBuffer | vk::BufferUsageFlagBits::eTransferDst;

  if(nmaterials > 0) {
    materialBf = createBufferUnique(
      tableUsage,
      VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT,
      sizeof(camp::Material) * nmaterials,
      0,
      VMA_MEMORY_USAGE_AUTO,
      VARIABLE_NAME(materialBf));
  }

  if(nlights > 0) {
    lightBf = createBufferUnique(
      tableUsage,
      VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT,
      sizeof(camp::Light) * nlights,
      0,
      VMA_MEMORY_USAGE_AUTO,
      VARIABLE_NAME(lightBf));
  }
}

void AsyVkRender::createImmediateRenderTargets()
{
  // Choose post-process format: FXAA requires RGBA8 to match layout rgba8
  postProcFormat = fxaa ? vk::Format::eR8G8B8A8Unorm : backbufferImageFormat;
  immRenderTargetViews.clear();
  immediateRenderTargetImgs.clear();
  prePresentationImages.clear();
  prePresentationImgViews.clear();
  immRenderTargetSampler.clear();

  auto const framebufferSize= backbufferImages.size();

  immRenderTargetViews.reserve(framebufferSize);
  immediateRenderTargetImgs.reserve(framebufferSize);
  prePresentationImages.reserve(framebufferSize);
  prePresentationImgViews.reserve(framebufferSize);
  immRenderTargetSampler.reserve(framebufferSize);

  for (size_t i= 0; i < framebufferSize; ++i)
  {
    // for immediate render target (after pixel shader)
    auto const& immRenderTarget= immediateRenderTargetImgs.emplace_back(createImage(
            backbufferExtent.width,
            backbufferExtent.height,
            vk::SampleCountFlagBits::e1,
            postProcFormat,
            vk::ImageUsageFlagBits::eColorAttachment
                    | vk::ImageUsageFlagBits::eSampled
                    | vk::ImageUsageFlagBits::eStorage,
            VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT
    ));

    setDebugObjectName(vk::Image(immRenderTarget.getImage()), "immediateRenderTargetImg" + std::to_string(i));

    auto& immRenderImgView= immRenderTargetViews.emplace_back();
    createImageView(
            postProcFormat,
            vk::ImageAspectFlagBits::eColor,
            immRenderTarget.getImage(),
            immRenderImgView
    );
    setDebugObjectName(*immRenderImgView, "immediateRenderTargetImgView" + std::to_string(i));

    // for sampling imm render target
    auto& sampler = immRenderTargetSampler.emplace_back(device->createSamplerUnique(vk::SamplerCreateInfo(
            {},
            vk::Filter::eLinear, vk::Filter::eLinear, vk::SamplerMipmapMode::eNearest,
            vk::SamplerAddressMode::eClampToEdge, vk::SamplerAddressMode::eClampToEdge,
            vk::SamplerAddressMode::eClampToEdge,
            0.f, false, 0.0, false, vk::CompareOp::eNever, 0.0, 0.0, vk::BorderColor::eFloatTransparentBlack,
            true
    )));
    setDebugObjectName(*sampler, "immRtImgSampler" + std::to_string(i));


    // for pre-presentation (after post-processing)
    auto const& prePresentationTarget= prePresentationImages.emplace_back(createImage(
      backbufferExtent.width,
      backbufferExtent.height,
      vk::SampleCountFlagBits::e1,
      postProcFormat,
      vk::ImageUsageFlagBits::eTransferSrc | vk::ImageUsageFlagBits::eStorage,
      VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT
    ));

    auto& prePresentationImageView= prePresentationImgViews.emplace_back();
    createImageView(
            postProcFormat,
            vk::ImageAspectFlagBits::eColor,
            prePresentationTarget.getImage(),
            prePresentationImageView
    );

    setDebugObjectName(vk::Image(prePresentationTarget.getImage()), "prePresentationTarget" + std::to_string(i));
    setDebugObjectName(*prePresentationImageView, "prePresentationImgView" + std::to_string(i));
  }
}

/**
 * Creates the transparency buffers whose sizes depend on the backbuffer
 * extent: count, global sum, offset, opaque colour, opaque depth and (with
 * `GPUcompress`) index.  Does nothing beyond preparing the scene and
 * requesting a redisplay when the scene is opaque.
 */
void AsyVkRender::createDependentBuffers()
{
  prepareScene(); // Determine whether the scene is opaque.
  redisplay=true;

  if(Opaque == 1)
    return;

  pixels=(backbufferExtent.width+1)*(backbufferExtent.height+1);

  // Round the pixel count up to whole work groups (G groups of groupSize).
  std::uint32_t G=ceilquotient(pixels,groupSize);
  std::uint32_t Pixels=groupSize*G;
  globalSize=localSize*ceilquotient(G,localSize)*sizeof(std::uint32_t);

  countBufferSize=(Pixels+1)*sizeof(std::uint32_t);
  offsetBufferSize=(Pixels+2)*sizeof(std::uint32_t);
  opaqueBufferSize=pixels*sizeof(vec4);
  opaqueDepthBufferSize=sizeof(std::uint32_t)+pixels*sizeof(float);
  indexBufferSize=pixels*sizeof(std::uint32_t);

  VkMemoryPropertyFlags countBufferFlags = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT;
  VmaAllocationCreateFlags vmaFlags = 0;

  countBf = createBufferUnique(
          vk::BufferUsageFlagBits::eStorageBuffer
                  | vk::BufferUsageFlagBits::eTransferDst
                  | vk::BufferUsageFlagBits::eTransferSrc,
          countBufferFlags,
          countBufferSize,
          vmaFlags,
          VMA_MEMORY_USAGE_AUTO,
          VARIABLE_NAME(countBf)
          );

  auto usageflags=vk::BufferUsageFlagBits::eStorageBuffer |
    vk::BufferUsageFlagBits::eTransferDst;

  globalSumBf = createBufferUnique(
    usageflags,
          VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT,
          globalSize,
          vmaFlags,
          VMA_MEMORY_USAGE_AUTO,
          VARIABLE_NAME(globalSumBf));

  offsetBf = createBufferUnique(
    usageflags,
          VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT,
          offsetBufferSize,
          vmaFlags,
          VMA_MEMORY_USAGE_AUTO,
          VARIABLE_NAME(offsetBf));

  opaqueBf = createBufferUnique(
                     vk::BufferUsageFlagBits::eStorageBuffer,
                     VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT,
                     opaqueBufferSize,
                     vmaFlags,
                     VMA_MEMORY_USAGE_AUTO,
                     VARIABLE_NAME(opaqueBf));

  // Sized with opaqueDepthBufferSize, the same value that is used as this
  // buffer's descriptor range (it was previously allocated with the larger
  // opaqueBufferSize).
  opaqueDepthBf = createBufferUnique(
    usageflags,
                     VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT,
                     opaqueDepthBufferSize,
                     vmaFlags,
                     VMA_MEMORY_USAGE_AUTO,
                     VARIABLE_NAME(opaqueDepthBf));

  if(GPUcompress) {
    indexBf = createBufferUnique(
      usageflags,
      VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT,
      indexBufferSize,
      VMA_ALLOCATION_CREATE_DEDICATED_MEMORY_BIT,
      VMA_MEMORY_USAGE_AUTO_PREFER_DEVICE,
      VARIABLE_NAME(indexBf));
  }

  zeroTransparencyBuffers();
}

void AsyVkRender::initIBL() {

  string imageDir=settings::locateFile(settings::getSetting<string>("imageDir"))+"/";
  string imagePath=imageDir+settings::getSetting<string>("image")+"/";

  auto const createReflectionSampler = [=](
    vma::cxx::UniqueImage& uniqueImg,
    vk::UniqueImageView& imageView,
    vk::UniqueSampler& sampler,
    std::vector<string> texturePaths
  ) {

    auto const imageType = texturePaths.size() > 1 ? vk::ImageType::e3D : vk::ImageType::e2D;
    auto const imageViewType = texturePaths.size() > 1 ? vk::ImageViewType::e3D : vk::ImageViewType::e2D;
    auto offset = 0;
    for (auto const& f: texturePaths) {

      camp::IEXRFile texture(f);

      auto && w = texture.size().first;
      auto && h = texture.size().second;

      if (uniqueImg.getImage() == VK_NULL_HANDLE) {

        uniqueImg = createImage(
                w, h,
                vk::SampleCountFlagBits::e1,
                vk::Format::eR32G32B32A32Sfloat,
                vk::ImageUsageFlagBits::eSampled | vk::ImageUsageFlagBits::eTransferDst,
                VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT,
                imageType,
                texturePaths.size()
        );
        transitionImageLayout(vk::ImageLayout::eUndefined, vk::ImageLayout::eTransferDstOptimal, uniqueImg.getImage());
      }

      copyDataToImage(texture.getData(),
                      sizeof(vec4) * w * h,
                      uniqueImg.getImage(),
                      w, h,
                      {0, 0, offset++});
    }

    transitionImageLayout(vk::ImageLayout::eTransferDstOptimal, vk::ImageLayout::eShaderReadOnlyOptimal, uniqueImg.getImage());
    createImageView(vk::Format::eR32G32B32A32Sfloat, vk::ImageAspectFlagBits::eColor, uniqueImg.getImage(), imageView, imageViewType);
    createImageSampler(sampler);
  };

  createReflectionSampler(
    irradianceImg,
    irradianceView,
    irradianceSampler,
    {imagePath+"diffuse.exr"}
  );

  createReflectionSampler(
    brdfImg,
    brdfView,
    brdfSampler,
    {imageDir+"refl.exr"}
  );

  std::vector<string> files;

  constexpr auto NTEXTURES=11;
  for(auto i = 0; i < NTEXTURES; ++i) {

    files.emplace_back(imagePath+"refl"+std::to_string(i).c_str()+".exr");
  }

  createReflectionSampler(
    reflectionImg,
    reflectionView,
    reflectionSampler,
    files
  );
}

void AsyVkRender::createCountRenderPass()
{
  std::array<vk::SubpassDescription2, 3> subpasses;

  subpasses[0] = vk::SubpassDescription2(
    vk::SubpassDescriptionFlags(),
    vk::PipelineBindPoint::eGraphics,
    0,
    0,
    nullptr,
    0,
    nullptr,
    nullptr,
    nullptr,
    0,
    nullptr,
    nullptr
  );
  subpasses[1] = vk::SubpassDescription2(
    vk::SubpassDescriptionFlags(),
    vk::PipelineBindPoint::eGraphics,
    0,
    0,
    nullptr,
    0,
    nullptr,
    nullptr,
    nullptr,
    0,
    nullptr,
    nullptr
  );
  subpasses[2] = vk::SubpassDescription2(
    vk::SubpassDescriptionFlags(),
    vk::PipelineBindPoint::eGraphics,
    0,
    0,
    nullptr,
    0,
    nullptr,
    nullptr,
    nullptr,
    0,
    nullptr,
    nullptr
  );

  std::array<vk::SubpassDependency2, 3> dependencies;

  dependencies[0] = vk::SubpassDependency2(
    VK_SUBPASS_EXTERNAL,
    0,
    vk::PipelineStageFlagBits::eColorAttachmentOutput,
    vk::PipelineStageFlagBits::eFragmentShader,
    vk::AccessFlagBits::eNone,
    vk::AccessFlagBits::eNone
  );
  dependencies[1] = vk::SubpassDependency2(
    0,
    1,
    vk::PipelineStageFlagBits::eFragmentShader,
    vk::PipelineStageFlagBits::eFragmentShader,
    vk::AccessFlagBits::eNone,
    vk::AccessFlagBits::eNone
  );
  dependencies[2] = vk::SubpassDependency2(
    1,
    2,
    vk::PipelineStageFlagBits::eBottomOfPipe,
    vk::PipelineStageFlagBits::eFragmentShader,
    vk::AccessFlagBits::eMemoryWrite,
    vk::AccessFlagBits::eMemoryRead
  );

  auto renderPassCI = vk::RenderPassCreateInfo2(
    vk::RenderPassCreateFlags(),
    0,
    nullptr,
    subpasses.size(),
    subpasses.data(),
    dependencies.size(),
    dependencies.data()
  );

  countRenderPass = device->createRenderPass2Unique(renderPassCI);

  if (!countRenderPass)
    runtimeError("failed to create the count render pass");
}

// Create the two render passes used for regular drawing:
//
//  * opaqueGraphicsRenderPass - only the first subpass and first dependency.
//  * graphicsRenderPass       - all three subpasses (opaque, transparent,
//                               final blend) and all three dependencies.
//
// Both passes share the same attachment list:
//   0: color (msaaSamples), 1: depth (msaaSamples), 2: single-sample color.
// With MSAA, attachment 2 is the resolve target of subpass 0; without MSAA,
// subpass 0 renders directly into attachment 2.
void AsyVkRender::createGraphicsRenderPass()
{
  auto colorAttachment = vk::AttachmentDescription2(
    vk::AttachmentDescriptionFlags(),
    postProcFormat,
    msaaSamples,
    vk::AttachmentLoadOp::eClear,
    vk::AttachmentStoreOp::eStore,
    vk::AttachmentLoadOp::eDontCare,
    vk::AttachmentStoreOp::eDontCare,
    vk::ImageLayout::eUndefined,
    vk::ImageLayout::eColorAttachmentOptimal
  );

  // If we are using fxaa, the output needs to be eGeneral
  // since we are passing that to fxaa compute shader, otherwise
  // we can go to presentSrc since we are passing it to the swap chain

  // Again, we should really be using scene graphs here. The
  // code will only get more complicated from now on...
  vk::ImageLayout colorAttachmentFinalLayout = fxaa ?
    vk::ImageLayout::eGeneral :
    (View ? vk::ImageLayout::ePresentSrcKHR :
     vk::ImageLayout::eColorAttachmentOptimal);

  auto colorResolveAttachment = vk::AttachmentDescription2(
    vk::AttachmentDescriptionFlags(),
    postProcFormat,
    vk::SampleCountFlagBits::e1,
    vk::AttachmentLoadOp::eDontCare,
    vk::AttachmentStoreOp::eStore,
    vk::AttachmentLoadOp::eDontCare,
    vk::AttachmentStoreOp::eDontCare,
    vk::ImageLayout::eUndefined,
    colorAttachmentFinalLayout
  );
  auto depthAttachment = vk::AttachmentDescription2(
    vk::AttachmentDescriptionFlags(),
    vk::Format::eD32Sfloat,
    msaaSamples,
    vk::AttachmentLoadOp::eClear,
    vk::AttachmentStoreOp::eDontCare,
    vk::AttachmentLoadOp::eDontCare,
    vk::AttachmentStoreOp::eDontCare,
    vk::ImageLayout::eUndefined,
    vk::ImageLayout::eDepthStencilAttachmentOptimal
  );

  // The reference indices must match the order of `attachments` below.
  auto colorAttachmentRef = vk::AttachmentReference2(0, vk::ImageLayout::eColorAttachmentOptimal);
  auto depthAttachmentRef = vk::AttachmentReference2(1, vk::ImageLayout::eDepthStencilAttachmentOptimal);
  auto colorResolveAttachmentRef = vk::AttachmentReference2(2, vk::ImageLayout::eColorAttachmentOptimal);

  std::vector subpasses{
          // Subpass 0: writes color (resolved into attachment 2) and depth.
          vk::SubpassDescription2(
                  {},
                  vk::PipelineBindPoint::eGraphics,
                  0,
                  0,
                  nullptr,
                  1,
                  &colorAttachmentRef,
                  &colorResolveAttachmentRef,
                  &depthAttachmentRef
          ),
          // Subpass 1: references no attachments.
          vk::SubpassDescription2({}, vk::PipelineBindPoint::eGraphics, 0, 0, nullptr, 0, nullptr, nullptr, nullptr),
          // Subpass 2: Final blend. Writes color, no depth test.
          vk::SubpassDescription2({}, vk::PipelineBindPoint::eGraphics, 0, 0, nullptr, 1, &colorResolveAttachmentRef)
  };

  if (msaaSamples == vk::SampleCountFlagBits::e1)
  {
    // Without multisampling nothing is resolved: subpass 0 draws straight
    // into the single-sample attachment, which therefore takes over the
    // clear that the multisampled attachment would otherwise perform.
    colorAttachment.loadOp = vk::AttachmentLoadOp::eDontCare;
    colorResolveAttachment.loadOp = vk::AttachmentLoadOp::eClear;

    subpasses[0].pColorAttachments = &colorResolveAttachmentRef;
    subpasses[0].pResolveAttachments = nullptr;
    subpasses[2].pResolveAttachments = nullptr;
  }

  // Built only after the MSAA adjustments above, because the vector stores
  // copies of the descriptions.
  std::vector const attachments
  {
    colorAttachment,
    depthAttachment,
    colorResolveAttachment
  };

  std::vector const dependencies{
          vk::SubpassDependency2(
                  VK_SUBPASS_EXTERNAL,
                  0,
                  vk::PipelineStageFlagBits::eColorAttachmentOutput,
                  vk::PipelineStageFlagBits::eColorAttachmentOutput,
                  vk::AccessFlagBits::eNone,
                  vk::AccessFlagBits::eNone
          ),
          vk::SubpassDependency2(
                   0, // from opaque
                   1, // to transparent
                   vk::PipelineStageFlagBits::eLateFragmentTests,
                   vk::PipelineStageFlagBits::eEarlyFragmentTests,
                   vk::AccessFlagBits::eDepthStencilAttachmentWrite,
                   vk::AccessFlagBits::eDepthStencilAttachmentRead
          ),
          // Dependency from transparent pass to blend pass
          vk::SubpassDependency2(
                   1, // from transparent
                   2, // to blend
                   vk::PipelineStageFlagBits::eColorAttachmentOutput,
                   vk::PipelineStageFlagBits::eFragmentShader,
                   vk::AccessFlagBits::eColorAttachmentWrite,
                   vk::AccessFlagBits::eShaderRead
          )
  };

  // only use the first subpass and first dependency
  auto const opaqueRenderPassCI =
          vk::RenderPassCreateInfo2({}, VEC_VIEW(attachments), 1, subpasses.data(), 1, dependencies.data());
  opaqueGraphicsRenderPass = device->createRenderPass2Unique(opaqueRenderPassCI);

  // Validate the handle before it is dereferenced for debug naming, exactly
  // as is done for graphicsRenderPass below.
  if (!opaqueGraphicsRenderPass)
    runtimeError("failed to create the opaque render pass");
  setDebugObjectName(*opaqueGraphicsRenderPass, "opaqueGraphicsRenderPass");

  auto renderPassCI = vk::RenderPassCreateInfo2(
    vk::RenderPassCreateFlags(),
    VEC_VIEW(attachments),
    VEC_VIEW(subpasses),
    VEC_VIEW(dependencies)
  );
  graphicsRenderPass = device->createRenderPass2Unique(renderPassCI);

  if (!graphicsRenderPass)
    runtimeError("failed to create the graphics render pass");
  setDebugObjectName(*graphicsRenderPass, "graphicsRenderPass");
}

void AsyVkRender::createGraphicsPipelineLayout()
{
  auto flagsPushConstant = vk::PushConstantRange(
    vk::ShaderStageFlagBits::eFragment,
    0,
    sizeof(PushConstants)
  );

  auto pipelineLayoutCI = vk::PipelineLayoutCreateInfo(
    vk::PipelineLayoutCreateFlags(),
    1,
    &*materialDescriptorSetLayout,
    1,
    &flagsPushConstant
  );

  graphicsPipelineLayout = device->createPipelineLayoutUnique(pipelineLayoutCI, nullptr);
}

// Append the preprocessor options implied by the current renderer settings to
// `options`.
//
// Order of the appended entries: MATERIAL (every type except PIPELINE_COUNT),
// USE_IBL, ORTHOGRAPHIC, ENABLE_FXAA, OUTPUT_AS_SRGB. PIPELINE_OPAQUE then
// adds OPAQUE and stops; every other type continues with GPUCOMPRESS,
// HAVE_INTERLOCK and the LOCALSIZE / BLOCKSIZE / ARRAYSIZE values.
void AsyVkRender::modifyShaderOptions(std::vector<std::string>& options, PipelineType type) {
  // Appends `macro` only when the corresponding switch is on.
  auto const define = [&options](bool enabled, char const* macro) {
    if (enabled)
      options.emplace_back(macro);
  };

  define(type != PIPELINE_COUNT, "MATERIAL");
  define(ibl, "USE_IBL");
  define(orthographic, "ORTHOGRAPHIC");
  define(fxaa, "ENABLE_FXAA");
  define(srgb, "OUTPUT_AS_SRGB");

  if (type == PIPELINE_OPAQUE) {
    // Opaque pipelines take none of the options below.
    options.emplace_back("OPAQUE");
    return;
  }

  define(GPUcompress, "GPUCOMPRESS");

  // from now on, only things relevant to compute
  define(interlock, "HAVE_INTERLOCK");

  options.emplace_back("LOCALSIZE " + std::to_string(localSize));
  options.emplace_back("BLOCKSIZE " + std::to_string(blockSize));
  options.emplace_back("ARRAYSIZE " + std::to_string(maxSize));
}

// Build one graphics pipeline and store it in `graphicsPipeline`.
//
// V                  - vertex type; VertexInputTraits<V> supplies the vertex
//                      binding and attribute descriptions.
// type               - selects the render pass and depth state (see the
//                      switch below) and, for PIPELINE_COUNT, the shaders.
// graphicsPipeline   - receives the created pipeline on success.
// topology, fillMode - primitive topology and polygon mode.
// options            - shader preprocessor options; taken by value because
//                      additional options are appended to the local copy.
// name               - debug object name given to the pipeline.
// vertexShader,
// fragmentShader     - shader base names, resolved to
//                      SHADER_DIRECTORY + name + ".glsl"; replaced by
//                      "vertex"/"count" when type is PIPELINE_COUNT.
// graphicsSubpass    - subpass index inside the selected render pass.
// enableDepthWrite   - depth write switch (not used for the COUNT/COMPRESS
//                      types, which disable depth testing and writing).
// transparent,
// disableMultisample - either one forces single-sample rasterization.
//
// Calls runtimeError() if pipeline creation does not return eSuccess.
template<typename V>
void AsyVkRender::createGraphicsPipeline(PipelineType type, vk::UniquePipeline & graphicsPipeline, vk::PrimitiveTopology topology,
                                         vk::PolygonMode fillMode, std::vector<std::string> options,
                                         std::string const & name,
                                         std::string const & vertexShader,
                                         std::string const & fragmentShader,
                                         int graphicsSubpass, bool enableDepthWrite,
                                         bool transparent, bool disableMultisample)
{
  std::string vertShaderName = SHADER_DIRECTORY + vertexShader + ".glsl";
  std::string fragShaderName = SHADER_DIRECTORY + fragmentShader + ".glsl";

  // True for point lists; only consulted for the count pipeline below.
  bool width=topology == vk::PrimitiveTopology::ePointList;

  if (type == PIPELINE_COUNT) {
    // The count pipeline always uses its own shader pair and a reduced
    // option set; modifyShaderOptions() is not applied to it.
    vertShaderName = SHADER_DIRECTORY "vertex.glsl";
    fragShaderName = SHADER_DIRECTORY "count.glsl";
    if(width)
      options.emplace_back("WIDTH");
    if(GPUcompress)
      options.emplace_back("GPUCOMPRESS");
  } else
    modifyShaderOptions(options, type);

  auto vertShaderModule = createShaderModule(EShLangVertex, vertShaderName, options);
  auto fragShaderModule = createShaderModule(EShLangFragment, fragShaderName, options);

  // Empty specialization info shared by both stages.
  auto specializationInfo = vk::SpecializationInfo();

  auto vertShaderStageCI = vk::PipelineShaderStageCreateInfo(
    vk::PipelineShaderStageCreateFlags(),
    vk::ShaderStageFlagBits::eVertex,
    *vertShaderModule,
    "main",
    &specializationInfo
  );
  auto fragShaderStageCI = vk::PipelineShaderStageCreateInfo(
    vk::PipelineShaderStageCreateFlags(),
    vk::ShaderStageFlagBits::eFragment,
    *fragShaderModule,
    "main",
    &specializationInfo
  );

  vk::PipelineShaderStageCreateInfo stages[] = {vertShaderStageCI, fragShaderStageCI};

  // Create vertex input state based on shader type
  // (bindingDescription and attributeDescriptions are declared at function
  // scope because vertexInputCI stores pointers to them and is used by the
  // pipeline creation call at the end of this function).
  vk::PipelineVertexInputStateCreateInfo vertexInputCI;
  vk::VertexInputBindingDescription bindingDescription;
  std::vector<vk::VertexInputAttributeDescription> attributeDescriptions;

  // NOTE: this checks the `vertexShader` argument, not vertShaderName, so it
  // is unaffected by the PIPELINE_COUNT override above.
  if (vertexShader == "screen") {
    // For screen shader, use empty vertex input state
    vertexInputCI = vk::PipelineVertexInputStateCreateInfo();
  } else {
    // For all other shaders, get the binding description and attribute descriptions
    // (the boolean tells the traits whether the count variant is wanted).
    bindingDescription = VertexInputTraits<V>::binding();
    attributeDescriptions = VertexInputTraits<V>::attributes(type == PIPELINE_COUNT);

    vertexInputCI = vk::PipelineVertexInputStateCreateInfo(
      vk::PipelineVertexInputStateCreateFlags(),
      1,
      &bindingDescription,
      static_cast<uint32_t>(attributeDescriptions.size()),
      attributeDescriptions.data()
    );
  }

  auto inputAssemblyCI = vk::PipelineInputAssemblyStateCreateInfo(
    vk::PipelineInputAssemblyStateCreateFlags(),
    topology,
    VK_FALSE
  );

  // Set origin at lower-left corner with y coordinate increasing up
  // (y starts at the full height and the viewport height is negative).
  auto viewport = vk::Viewport(
    0.0f,
    static_cast<float>(backbufferExtent.height),
    static_cast<float>(backbufferExtent.width),
    -static_cast<float>(backbufferExtent.height),
    0.0f,
    1.0f
  );

  // The scissor covers the whole back buffer.
  auto scissor = vk::Rect2D(
    vk::Offset2D(0, 0),
    backbufferExtent
  );

  // Viewport and scissor are baked into the pipeline; no dynamic state is
  // supplied to the pipeline create info below.
  auto viewportStateCI = vk::PipelineViewportStateCreateInfo(
    vk::PipelineViewportStateCreateFlags(),
    1,
    &viewport,
    1,
    &scissor
  );

  // No culling, counter-clockwise front faces, line width 1.
  auto rasterizerCI = vk::PipelineRasterizationStateCreateInfo(
    vk::PipelineRasterizationStateCreateFlags(),
    VK_FALSE,
    VK_FALSE,
    fillMode,
    vk::CullModeFlagBits::eNone,
    vk::FrontFace::eCounterClockwise,
    VK_FALSE,
    0.0f,
    0.0f,
    0.0f,
    1.0f
  );

  // Single-sample rasterization when requested via `transparent` or
  // `disableMultisample`; otherwise the renderer-wide msaaSamples.
  auto multisamplingCI = vk::PipelineMultisampleStateCreateInfo(
    vk::PipelineMultisampleStateCreateFlags(),
    transparent || disableMultisample ? vk::SampleCountFlagBits::e1 : msaaSamples,
    VK_FALSE,
    0.0f,
    nullptr,
    VK_FALSE,
    VK_FALSE
  );

  // Blending disabled; all four color channels are written.
  auto colorBlendAttachment = vk::PipelineColorBlendAttachmentState(
    VK_FALSE,
    vk::BlendFactor::eZero,
    vk::BlendFactor::eZero,
    vk::BlendOp::eAdd,
    vk::BlendFactor::eZero,
    vk::BlendFactor::eZero,
    vk::BlendOp::eAdd,
    vk::ColorComponentFlagBits::eR | vk::ColorComponentFlagBits::eG | vk::ColorComponentFlagBits::eB | vk::ColorComponentFlagBits::eA
  );

  auto colorBlendCI = vk::PipelineColorBlendStateCreateInfo(
    vk::PipelineColorBlendStateCreateFlags(),
    VK_FALSE,
    vk::LogicOp::eCopy,
    1,
    &colorBlendAttachment,
    {0.0f, 0.0f, 0.0f, 0.0f}
  );

  // Common depth/stencil settings; test/write enables are set per type below.
  auto depthStencilCI = vk::PipelineDepthStencilStateCreateInfo();

  depthStencilCI.depthCompareOp = vk::CompareOp::eLess;
  depthStencilCI.depthBoundsTestEnable = VK_FALSE;
  depthStencilCI.minDepthBounds = 0.f;
  depthStencilCI.maxDepthBounds = 1.f;
  depthStencilCI.stencilTestEnable = VK_FALSE;

  vk::RenderPass renderPass;

  // Choose the render pass the pipeline is created against, together with
  // the matching depth test/write configuration.
  switch(type) {
    case PIPELINE_OPAQUE:
      renderPass=*opaqueGraphicsRenderPass;
      depthStencilCI.depthTestEnable=VK_TRUE;
      depthStencilCI.depthWriteEnable=enableDepthWrite;
      break;
    case PIPELINE_COUNT:
    case PIPELINE_COMPRESS:
      // Count and compress pipelines run in the count render pass and
      // neither test nor write depth.
      renderPass=*countRenderPass;
      depthStencilCI.depthTestEnable=VK_FALSE;
      depthStencilCI.depthWriteEnable=VK_FALSE;
      break;
    default:
      renderPass=*graphicsRenderPass;
      depthStencilCI.depthTestEnable=VK_TRUE;
      depthStencilCI.depthWriteEnable=enableDepthWrite;
      break;
  }

  // Two shader stages (vertex + fragment); no tessellation state, no
  // dynamic state, no base pipeline.
  auto pipelineCI = vk::GraphicsPipelineCreateInfo(
    vk::PipelineCreateFlags(),
    2,
    stages,
    &vertexInputCI,
    &inputAssemblyCI,
    nullptr,
    &viewportStateCI,
    &rasterizerCI,
    &multisamplingCI,
    &depthStencilCI,
    &colorBlendCI,
    nullptr,
    *graphicsPipelineLayout,
    renderPass,
    graphicsSubpass,
    nullptr
  );

  // No pipeline cache is used.
  auto result = device->createGraphicsPipelineUnique(nullptr, pipelineCI, nullptr);
  if (result.result != vk::Result::eSuccess)
    runtimeError("failed to create pipeline");
  else
  {
    graphicsPipeline = std::move(result.value);
    setDebugObjectName(*graphicsPipeline, name);
  }
}

// Convenience overload: create a single pipeline from a PipelineConfig.
// For PIPELINE_COUNT the renderer-wide countShaderOptions replace the
// options stored in the config.
template<typename V>
void AsyVkRender::createGraphicsPipeline(PipelineType type, vk::UniquePipeline& graphicsPipeline, const AsyVkRender::PipelineConfig& config)
{
  auto const& shaderOptions =
    type == PIPELINE_COUNT ? countShaderOptions : config.shaderOptions;

  createGraphicsPipeline<V>(
    type, graphicsPipeline,
    config.topology, config.fillMode,
    shaderOptions,
    config.namePrefix,
    config.vertexShader, config.fragmentShader,
    config.graphicsSubpass,
    config.enableDepthWrite, config.transparent, config.disableMultisample
  );
}

// Create one pipeline per pipeline type in the half-open range [start, end)
// and store each in pipelines[type]. The debug name of every pipeline is
// config.namePrefix followed by the numeric value of its type; the
// PIPELINE_COUNT entry is built with countShaderOptions rather than the
// options of the config.
template<typename V>
void AsyVkRender::createPipelineSet(
    std::array<vk::UniquePipeline, PIPELINE_MAX>& pipelines,
    const AsyVkRender::PipelineConfig& config,
    PipelineType start,
    PipelineType end)
{
  auto const first = static_cast<unsigned>(start);
  auto const last = static_cast<unsigned>(end);

  for (auto index = first; index < last; ++index) {
    auto const type = static_cast<PipelineType>(index);
    auto const& shaderOptions =
      type == PIPELINE_COUNT ? countShaderOptions : config.shaderOptions;

    createGraphicsPipeline<V>(
      type, pipelines[index],
      config.topology, config.fillMode,
      shaderOptions,
      config.namePrefix + std::to_string(index),
      config.vertexShader, config.fragmentShader,
      config.graphicsSubpass,
      config.enableDepthWrite, config.transparent, config.disableMultisample
    );
  }
}

void AsyVkRender::createGraphicsPipelines()
{
  fpu_trap(false); // Work around FE_INVALID
  auto const drawMode =
    (mode == DRAWMODE_WIREFRAME || mode == DRAWMODE_OUTLINE)
    ? vk::PolygonMode::eLine
    : vk::PolygonMode::eFill;

  std::vector<PipelineConfig> configs = {
    // Material triangles
    {
      vk::PrimitiveTopology::eTriangleList, drawMode, materialShaderOptions,
      "materialPipeline", "vertex", "fragment", 0, true, false, false
    },
    // Color triangles
    {
      vk::PrimitiveTopology::eTriangleList, drawMode, colorShaderOptions,
      "colorPipeline", "vertex", "fragment", 0, true, false, false
    },
    // Triangle groups
    {
      vk::PrimitiveTopology::eTriangleList, drawMode, triangleShaderOptions,
      "trianglePipeline", "vertex", "fragment", 0, true, false, false
    },
    // Lines
    {
      vk::PrimitiveTopology::eLineList, vk::PolygonMode::eLine, materialShaderOptions,
      "linePipeline", "vertex", "fragment", 0, true, false, false
    },
    // Points
    {
      vk::PrimitiveTopology::ePointList,
#ifdef __APPLE__
      vk::PolygonMode::eFill,
#else
      vk::PolygonMode::ePoint,
#endif
      pointShaderOptions, "pointPipeline", "vertex", "fragment", 0, true, false, false
    }
  };

  createPipelineSet<MaterialVertex>(materialPipelines, configs[0]);
  createPipelineSet<ColorVertex>(colorPipelines, configs[1]);
  createPipelineSet<ColorVertex>(trianglePipelines, configs[2]);
  createPipelineSet<MaterialVertex>(linePipelines, configs[3]);
  createPipelineSet<PointVertex>(pointPipelines, configs[4]);

  // Create pipelines for transparent triangles
  PipelineConfig transparentConfig = {
      vk::PrimitiveTopology::eTriangleList, drawMode, transparentShaderOptions,
      "transparentPipeline", "vertex", "fragment", 1, false, true, false
  };
  createPipelineSet<ColorVertex>(transparentPipelines, transparentConfig, PIPELINE_TRANSPARENT, PIPELINE_MAX);

  static std::vector<std::string> emptyOptions;
  PipelineConfig compressConfig = {
      vk::PrimitiveTopology::eTriangleList, vk::PolygonMode::eFill, emptyOptions,
      "compressPipeline", "screen", "compress", 2, false, false, true
  };
  createGraphicsPipeline<ColorVertex>(PIPELINE_COMPRESS, compressPipeline, compressConfig);

  createBlendPipeline();
  fpu_trap(settings::trap());
}

// Compute how many thread groups are needed in each dimension to cover the
// back buffer, rounding up.
void AsyVkRender::setupPostProcessingComputeParameters()
{
  // TODO: share this constant between the shader code and the C++ side.
  constexpr uint32_t threadsPerGroupEdge = 20;

  postProcessThreadGroupCount.width =
    ceilquotient(backbufferExtent.width, threadsPerGroupEdge);
  postProcessThreadGroupCount.height =
    ceilquotient(backbufferExtent.height, threadsPerGroupEdge);
}

// Create the blend pipeline: "screen" vertex shader with the "blend"
// fragment shader, bound to subpass 2, with depth writes off and
// multisampling disabled.
void AsyVkRender::createBlendPipeline() {
  static std::vector<std::string> noOptions;

  PipelineConfig const blendConfig = {
    vk::PrimitiveTopology::eTriangleList,
    vk::PolygonMode::eFill,
    noOptions,
    "blendPipeline",
    "screen",
    "blend",
    2,
    false,
    false,
    true
  };

  createGraphicsPipeline<ColorVertex>(PIPELINE_DONTCARE, blendPipeline, blendConfig);
}

// Create a compute pipeline layout and a compute pipeline that uses it.
//
// layout        - receives the new layout (set layouts from descSetLayout
//                 plus one ComputePushConstants range for the compute stage).
// pipeline      - receives the pipeline built from `shaderFile`.
// shaderFile    - shader name, passed on to createComputePipelineOnly().
// descSetLayout - descriptor set layouts referenced by the new layout.
void AsyVkRender::createComputePipeline(
  vk::UniquePipelineLayout& layout,
  vk::UniquePipeline& pipeline,
  std::string const& shaderFile,
  std::vector<vk::DescriptorSetLayout> const& descSetLayout
)
{
  vk::PushConstantRange const computePushConstants(
    vk::ShaderStageFlagBits::eCompute,
    0,
    sizeof(ComputePushConstants)
  );

  auto const layoutInfo = vk::PipelineLayoutCreateInfo(
    vk::PipelineLayoutCreateFlags(),
    VEC_VIEW(descSetLayout),
    1,
    &computePushConstants
  );

  layout = device->createPipelineLayoutUnique(layoutInfo, nullptr);

  createComputePipelineOnly(*layout, pipeline, shaderFile);
}

// Build a compute pipeline against a layout that already exists; no layout
// is created here. Use this when several compute pipelines share a single
// VkPipelineLayout: going through createComputePipeline() for each of them
// would replace the layout every time and leave the earlier pipelines
// referencing a destroyed object.
//
// The shader is loaded from SHADER_DIRECTORY + shaderFile + ".glsl" and
// compiled with the options produced by modifyShaderOptions() for
// PIPELINE_DONTCARE. Calls runtimeError() if creation does not succeed.
void AsyVkRender::createComputePipelineOnly(
  vk::PipelineLayout layout,
  vk::UniquePipeline& pipeline,
  std::string const& shaderFile
)
{
  std::vector<std::string> shaderOptions;
  modifyShaderOptions(shaderOptions, PIPELINE_DONTCARE);

  auto const shaderPath = SHADER_DIRECTORY + shaderFile + ".glsl";
  vk::UniqueShaderModule shaderModule =
    createShaderModule(EShLangCompute, shaderPath, shaderOptions);

  vk::ComputePipelineCreateInfo pipelineInfo;
  pipelineInfo.layout = layout;
  pipelineInfo.stage = vk::PipelineShaderStageCreateInfo(
    vk::PipelineShaderStageCreateFlags(),
    vk::ShaderStageFlagBits::eCompute,
    *shaderModule,
    "main"
  );

  auto created = device->createComputePipelineUnique(VK_NULL_HANDLE, pipelineInfo);
  if (created.result != vk::Result::eSuccess) {
    runtimeError("failed to create compute pipeline");
    return;
  }

  pipeline = std::move(created.value);
}

void AsyVkRender::createComputePipelines()
{
  std::vector const computeDescSetLayoutVec { *computeDescriptorSetLayout };

  // Destroy old pipelines before destroying the old layout, to prevent
  // the validation layer from reporting "pipeline references deleted
  // VkPipelineLayout" when recreating swap chain / pipelines.
  sum1Pipeline.reset();
  sum2Pipeline.reset();
  sum3Pipeline.reset();

  // Create the shared pipeline layout only once, then create all three
  // pipelines using it.  Previously each call to createComputePipeline()
  // created a new layout and destroyed the old one, leaving sum1Pipeline
  // and sum2Pipeline referencing a destroyed VkPipelineLayout.
  createComputePipeline(sumPipelineLayout, sum1Pipeline, "sum1", computeDescSetLayoutVec);
  createComputePipelineOnly(*sumPipelineLayout, sum2Pipeline, "sum2");
  createComputePipelineOnly(*sumPipelineLayout, sum3Pipeline, "sum3");

  if (fxaa)
  {
    // Destroy old FXAA pipeline before recreating its layout.
    postProcessPipeline.reset();

    std::vector const postProcessDescSetLayoutVec{*postProcessDescSetLayout};
    createComputePipeline(postProcessPipelineLayout, postProcessPipeline, "fxaa.cs", postProcessDescSetLayoutVec);
  }
}

// Create the back-buffer-sized render targets and their image views:
//   colorImg        - postProcFormat, msaaSamples, transient color attachment
//   depthImg        - D32 float, msaaSamples
//   depthResolveImg - D32 float, single sample
// Every image and view also receives a debug object name.
void AsyVkRender::createAttachments()
{
  auto const width = backbufferExtent.width;
  auto const height = backbufferExtent.height;
  auto const depthFormat = vk::Format::eD32Sfloat;

  // Color target
  colorImg = createImage(
    width, height, msaaSamples, postProcFormat,
    vk::ImageUsageFlagBits::eTransientAttachment | vk::ImageUsageFlagBits::eColorAttachment,
    VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT
  );
  createImageView(postProcFormat, vk::ImageAspectFlagBits::eColor, colorImg.getImage(), colorImageView);
  setDebugObjectName(vk::Image(colorImg.getImage()), "colorImg");
  setDebugObjectName(*colorImageView, "colorImageView");

  // Depth target
  depthImg = createImage(
    width, height, msaaSamples, depthFormat,
    vk::ImageUsageFlagBits::eDepthStencilAttachment,
    VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT
  );
  createImageView(depthFormat, vk::ImageAspectFlagBits::eDepth, depthImg.getImage(), depthImageView);
  setDebugObjectName(vk::Image(depthImg.getImage()), "depthImg");
  setDebugObjectName(*depthImageView, "depthImageView");

  // Single-sample depth target
  depthResolveImg = createImage(
    width, height, vk::SampleCountFlagBits::e1, depthFormat,
    vk::ImageUsageFlagBits::eDepthStencilAttachment,
    VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT
  );
  createImageView(depthFormat, vk::ImageAspectFlagBits::eDepth, depthResolveImg.getImage(), depthResolveImageView);
  setDebugObjectName(vk::Image(depthResolveImg.getImage()), "depthResolve");
  setDebugObjectName(*depthResolveImageView, "depthResolveImageView");
}

// Upload the current matrices into the uniform buffer of frame
// `currentFrame`. Nothing happens unless newUniformBuffer or queueExport is
// set; newUniformBuffer is cleared after the upload.
void AsyVkRender::updateUniformBuffer(uint32_t currentFrame)
{
  bool const uploadNeeded = newUniformBuffer || queueExport;
  if (!uploadNeeded)
    return;

  UniformBufferObject ubo{ };

  // Access matrices directly to avoid synchronization
  ubo.projViewMat = mat4(getProjViewMat());
  ubo.viewMat = mat4(viewMat);

  // The normal matrix is stored as three vec4 columns (std140 mat3 layout,
  // 48 bytes); the fourth component of every column is padding.
  for (int column = 0; column < 3; ++column)
    ubo.normMat[column] = vec4(normMat[column], 0.0f);

  memcpy(frameObjects[currentFrame].uboMappedMemory->getCopyPtr(), &ubo, sizeof(ubo));

  newUniformBuffer = false;
}

void AsyVkRender::updateBuffers()
{
  // Don't update the material buffer if the materials aren't yet added
  bool materialsReady = !materials.empty();

  if (shouldUpdateBuffers && materialsReady) {
    std::vector<Light> lights;

    for (auto i = 0u; i < nlights; i++)
      lights.emplace_back(
        Light {
          {Lights[i].getx(), Lights[i].gety(), Lights[i].getz(), 0.f},
          {static_cast<float>(LightsDiffuse[4 * i]),
          static_cast<float>(LightsDiffuse[4 * i + 1]),
          static_cast<float>(LightsDiffuse[4 * i + 2]), 0.f}
        }
      );


    if (materials.size() > nmaterials) {
      nmaterials=materials.size();
    }

    createMaterialAndLightBuffers();
    writeMaterialAndLightDescriptors();

    if(lights.size() > 0)
      copyToBuffer(lightBf.getBuffer(), lights.data(), lights.size() * sizeof(Light));
    if(materials.size() > 0)
      copyToBuffer(materialBf.getBuffer(), materials.data(), materials.size() * sizeof(camp::Material));

    shouldUpdateBuffers=false;
  }
}

// Assemble the push constants for a draw call:
//   constants[0] - number of lights (0 unless mode is DRAWMODE_NORMAL)
//   constants[1] - back buffer width
//   constants[2] - back buffer height
//   background   - the four Background components
PushConstants AsyVkRender::buildPushConstants()
{
  PushConstants result{};

  result.constants[0] = mode == DRAWMODE_NORMAL ? nlights : 0;
  result.constants[1] = backbufferExtent.width;
  result.constants[2] = backbufferExtent.height;

  for (int channel = 0; channel < 4; ++channel)
    result.background[channel] = Background[channel];

  return result;
}

// Return the graphics command buffer belonging to the current frame.
vk::CommandBuffer & AsyVkRender::getFrameCommandBuffer()
{
  auto & frame = frameObjects[currentFrame];
  return *frame.commandBuffer;
}

// Return the compute command buffer belonging to the current frame.
vk::CommandBuffer & AsyVkRender::getFrameComputeCommandBuffer()
{
  auto & frame = frameObjects[currentFrame];
  return *frame.computeCommandBuffer;
}

// Pick the pipeline of a set that matches the current pass: the
// PIPELINE_OPAQUE entry when Opaque is set, the PIPELINE_TRANSPARENT entry
// otherwise.
vk::UniquePipeline & AsyVkRender::getPipelineType(std::array<vk::UniquePipeline, PIPELINE_MAX> & pipelines)
{
  auto const slot = Opaque ? PIPELINE_OPAQUE : PIPELINE_TRANSPARENT;
  return pipelines[slot];
}

// Make `cmd` the current command buffer and start recording into it with a
// default-constructed begin info.
void AsyVkRender::beginFrameCommands(vk::CommandBuffer cmd)
{
  vk::CommandBufferBeginInfo const beginInfo{};

  currentCommandBuffer = cmd;
  currentCommandBuffer.begin(beginInfo);
}

void AsyVkRender::beginCountFrameRender(int imageIndex)
{
  std::vector<vk::ClearValue> clearColors;


  auto renderPassInfo = vk::RenderPassBeginInfo(
    *countRenderPass,
    *depthFramebuffers[imageIndex],
    vk::Rect2D(vk::Offset2D(0, 0), backbufferExtent),
    clearColors.size(),
    clearColors.data()
  );

  currentCommandBuffer.beginRenderPass(renderPassInfo, vk::SubpassContents::eInline);
}

void AsyVkRender::beginGraphicsFrameRender(int imageIndex)
{
  std::array<vk::ClearValue, 3> clearColors;

  clearColors[0]= vk::ClearValue(Background);
  clearColors[1].depthStencil.depth = 1.f;
  clearColors[1].depthStencil.stencil = 0;
  clearColors[2] = vk::ClearValue(Background);

  auto renderPassInfo = vk::RenderPassBeginInfo(
    Opaque ? *opaqueGraphicsRenderPass : *graphicsRenderPass,
    Opaque ? *opaqueGraphicsFramebuffers[imageIndex] : *graphicsFramebuffers[imageIndex],
    vk::Rect2D(vk::Offset2D(0, 0), backbufferExtent),
    clearColors.size(),
    &clearColors[0]
  );

  currentCommandBuffer.beginRenderPass(renderPassInfo, vk::SubpassContents::eInline);
}

// Record an indexed draw of `data` into the current command buffer.
//
// bufpair  - per-frame GPU vertex/index buffer pair; refreshed from `data`
//            when a copy is required (see below).
// data     - CPU-side vertex and index data.
// pipeline - graphics pipeline to bind for the draw.
//
// Nothing is recorded when `data` has no indices (bufpair.nobjects is then
// reset to 0), or when a copy is required but `data` holds no vertices of
// any kind.
void AsyVkRender::drawBuffer(FrameBufferPair& bufpair, VertexBuffer * data, vk::Pipeline pipeline) {
  // Follow the OpenGL model: use current CPU-side index count for drawing.
  // When patches go offscreen, their vertices/indices are removed from the
  // CPU-side VertexBuffer by notRendered() + clear(). The GPU buffer may
  // still contain stale data from a previous frame, but we draw only as many
  // indices as currently exist on the CPU, matching OpenGL behavior.
  if (data->indices.empty()) {
    bufpair.nobjects = 0;
    return;
  }

  // Upload when remeshing, while the data has been rendered fewer than
  // maxFramesInFlight times, or when no vertex buffer exists yet - unless
  // `copied` is set.
  auto const badBuffer = static_cast<void*>(bufpair.vertexBuffer.getBuffer()) == nullptr;
  auto const copy = (remesh || data->renderCount < maxFramesInFlight || badBuffer) && !copied;

  if (copy) {

    // Exactly one vertex kind is uploaded: the first non-empty container.
    if (!data->materialVertices.empty())
    {
      uploadPersistentBuffer(bufpair, data->materialVertices.data(),
                             data->materialVertices.size() * sizeof(camp::MaterialVertex),
                             0, VB_USAGE_FLAGS, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, true);
    }
    else if (!data->colorVertices.empty())
    {
      uploadPersistentBuffer(bufpair, data->colorVertices.data(),
                             data->colorVertices.size() * sizeof(camp::ColorVertex),
                             0, VB_USAGE_FLAGS, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, true);
    }
    else if(!data->pointVertices.empty())
    {
      uploadPersistentBuffer(bufpair, data->pointVertices.data(),
                             data->pointVertices.size() * sizeof(camp::PointVertex),
                             0, VB_USAGE_FLAGS, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, true);
    }
    else
      return;

    uploadPersistentBuffer(bufpair, data->indices.data(),
                           data->indices.size() * sizeof(data->indices[0]),
                           data->indices.size(), IB_USAGE_FLAGS,
                           VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, false);
  }

  // This runs once per draw call, so use fixed-size arrays for the single
  // vertex buffer binding instead of heap-allocating vectors.
  std::array<vk::Buffer, 1> const vertexBuffers = {{bufpair.vertexBuffer.getBuffer()}};
  std::array<vk::DeviceSize, 1> const vertexOffsets = {{0}};
  auto const pushConstants = buildPushConstants();

  currentCommandBuffer.bindPipeline(vk::PipelineBindPoint::eGraphics, pipeline);
  currentCommandBuffer.bindVertexBuffers(0, vertexBuffers, vertexOffsets);
  currentCommandBuffer.bindIndexBuffer(bufpair.indexBuffer.getBuffer(), 0, vk::IndexType::eUint32);
  currentCommandBuffer.pushConstants(*graphicsPipelineLayout, vk::ShaderStageFlagBits::eFragment, 0, sizeof(PushConstants), &pushConstants);
  // Use current CPU-side index count (like OpenGL's glDrawElements(drawType, data.indices.size(), ...))
  // rather than the cached bufpair.nobjects which may be stale when copy==false.
  currentCommandBuffer.drawIndexed(static_cast<uint32_t>(data->indices.size()), 1, 0, 0, 0);
}

// End the render pass that is active on the current command buffer.
void AsyVkRender::endFrameRender()
{
  auto & cmd = currentCommandBuffer;
  cmd.endRenderPass();
}

// Finish recording of the current command buffer.
void AsyVkRender::endFrameCommands()
{
  auto & cmd = currentCommandBuffer;
  cmd.end();
}

// Close the active render pass and then finish command recording.
// `imageIndex` is accepted but not used.
void AsyVkRender::endFrame(int imageIndex)
{
  static_cast<void>(imageIndex);

  endFrameRender();
  endFrameCommands();
}

// Draw the point data with the point pipeline for the current pass and
// bump its render counter.
void AsyVkRender::drawPoints(FrameObject & object)
{
  auto const & pipeline = getPipelineType(pointPipelines);
  drawBuffer(object.pointBuffers, &pointData, *pipeline);
  ++pointData.renderCount;
}

// Draw the line data with the line pipeline for the current pass and bump
// its render counter.
void AsyVkRender::drawLines(FrameObject & object)
{
  auto const & pipeline = getPipelineType(linePipelines);
  drawBuffer(object.lineBuffers, &lineData, *pipeline);
  ++lineData.renderCount;
}

// Draw the material data with the material pipeline for the current pass
// and bump its render counter.
void AsyVkRender::drawMaterials(FrameObject & object)
{
  auto const & pipeline = getPipelineType(materialPipelines);
  drawBuffer(object.materialBuffers, &materialData, *pipeline);
  ++materialData.renderCount;
}

// Draw the color data with the color pipeline for the current pass and
// bump its render counter.
void AsyVkRender::drawColors(FrameObject & object)
{
  auto const & pipeline = getPipelineType(colorPipelines);
  drawBuffer(object.colorBuffers, &colorData, *pipeline);
  ++colorData.renderCount;
}

// Draw the triangle data with the triangle pipeline for the current pass
// and bump its render counter.
void AsyVkRender::drawTriangles(FrameObject & object)
{
  auto const & pipeline = getPipelineType(trianglePipelines);
  drawBuffer(object.triangleBuffers, &triangleData, *pipeline);
  ++triangleData.renderCount;
}

// Draw the transparent data with the transparent pipeline for the current
// pass and bump its render counter.
void AsyVkRender::drawTransparent(FrameObject & object)
{
  auto const & pipeline = getPipelineType(transparentPipelines);
  drawBuffer(object.transparentBuffers, &transparentData, *pipeline);
  ++transparentData.renderCount;
}

/**
 * Record the three compute dispatches (sum1, sum2, sum3) of the partial-sum
 * calculation, separated by shader-write -> shader-read memory barriers.
 *
 * @param object  per-frame resources; supplies the timing command buffer and
 *                the sumFinishedEvent.
 * @param timing  when true, commands are recorded into
 *                object.partialSumsCommandBuffer and the sequence ends with a
 *                compute -> compute barrier so that it can be recorded
 *                back-to-back; when false, commands go into
 *                currentCommandBuffer, the sequence first waits on fragment
 *                shader writes, and it ends with a host-read barrier followed
 *                by setting object.sumFinishedEvent.
 */
void AsyVkRender::partialSums(FrameObject & object, bool timing)
{
  vk::CommandBuffer const cmd=timing ? *object.partialSumsCommandBuffer :
    currentCommandBuffer;

  auto const shaderWriteToRead=vk::MemoryBarrier(
    vk::AccessFlagBits::eShaderWrite,
    vk::AccessFlagBits::eShaderRead
  );

  // Record a single global memory barrier between two pipeline stages.
  auto const recordBarrier=[&cmd](vk::PipelineStageFlags from,
                                  vk::PipelineStageFlags to,
                                  vk::MemoryBarrier const& barrier) {
    cmd.pipelineBarrier(from,to,vk::DependencyFlags(),
                        1,&barrier,
                        0,nullptr,
                        0,nullptr);
  };

  // Barrier used between consecutive compute dispatches.
  auto const computeToCompute=[&]() {
    recordBarrier(vk::PipelineStageFlagBits::eComputeShader,
                  vk::PipelineStageFlagBits::eComputeShader,
                  shaderWriteToRead);
  };

  auto const blockSize=ceilquotient(g,localSize);
  auto const lastIndex=elements-1;
  ComputePushConstants pc{blockSize, lastIndex};

  cmd.pushConstants(*sumPipelineLayout,vk::ShaderStageFlagBits::eCompute,0,
                    sizeof(ComputePushConstants),&pc);

  cmd.bindDescriptorSets(vk::PipelineBindPoint::eCompute,*sumPipelineLayout,
                         0,1,&*computeDescriptorSet,0,nullptr);

  // sum1: only wait for fragment shaders if we are not timing
  if(!timing)
    recordBarrier(vk::PipelineStageFlagBits::eFragmentShader,
                  vk::PipelineStageFlagBits::eComputeShader,
                  shaderWriteToRead);

  cmd.bindPipeline(vk::PipelineBindPoint::eCompute,*sum1Pipeline);
  cmd.dispatch(g,1,1);

  // sum2: a single work group
  computeToCompute();
  cmd.bindPipeline(vk::PipelineBindPoint::eCompute,*sum2Pipeline);
  cmd.dispatch(1,1,1);

  // sum3: again one work group per block
  computeToCompute();
  cmd.bindPipeline(vk::PipelineBindPoint::eCompute,*sum3Pipeline);
  cmd.dispatch(g,1,1);

  if(timing) {
    // Keep successive timed repetitions ordered with respect to each other.
    computeToCompute();
    return;
  }

  // Make the compute shader writes visible to host reads ...
  auto const hostReadBarrier=vk::MemoryBarrier(
    vk::AccessFlagBits::eShaderWrite, // source: compute shader writes
    vk::AccessFlagBits::eHostRead     // destination: host reads
  );
  recordBarrier(vk::PipelineStageFlagBits::eComputeShader,
                vk::PipelineStageFlagBits::eHost,
                hostReadBarrier);

  // ... and signal the event after the barrier.
  cmd.setEvent(*object.sumFinishedEvent,
               vk::PipelineStageFlagBits::eComputeShader);
}

/**
 * Raise maxSize to ceilpow2(maxDepth) and flag the blend pipeline for
 * recreation.
 */
void AsyVkRender::resizeBlendShader(std::uint32_t maxDepth)
{
  recreateBlendPipeline=true;
  maxSize=ceilpow2(maxDepth);
}

/**
 * Read back the results of the count+compute submission and grow the
 * dependent resources if the scene needs more room.
 *
 * Blocks on object.inComputeFence, then reads two 32-bit words from the
 * feedback buffer: word 0 is taken as the maximum depth and word 1 as the
 * fragment count.  If the depth exceeds maxSize the blend shader is resized;
 * if the fragment count exceeds maxFragments the capacity is raised to 110%
 * of the fragment count and the scene-dependent buffers are rebuilt after
 * the device goes idle.
 */
void AsyVkRender::resizeFragmentBuffer(FrameObject & object) {
  // Wait on the fence from the count+compute submission instead of polling an event.
  // The fence puts the OS thread to sleep (zero CPU waste), whereas waitForEvent()
  // busy-waits in a tight loop.
  vkutils::checkVkResult(device->waitForFences(
    1, &*object.inComputeFence, VK_TRUE, vkTimeout
  ));

  // Ensure we have the latest data from GPU
  feedbackMappedPtr->invalidate();
  const uint32_t *feedbackData = feedbackMappedPtr->getCopyPtr();
  std::uint32_t maxDepth = feedbackData[0];
  fragments = feedbackData[1];

  // A pending depth reset discards both the cached and the measured depth.
  if(resetDepth) {
    maxSize=maxDepth=1;
    resetDepth=false;
  }

  if (maxDepth > maxSize) {
    resizeBlendShader(maxDepth);
  }

  if (fragments > maxFragments) {
    // 10% headroom.  fragments + fragments/10 equals 11*fragments/10 for
    // every integer value, but does not form the intermediate 11*fragments,
    // which would wrap for large counts if the counter is 32 bits wide.
    maxFragments=fragments+fragments/10;
    device->waitIdle();
    updateSceneDependentBuffers();
  }
}

/**
 * Record the compression pass: bind the compress pipeline, upload the
 * fragment-stage push constants and draw a single three-vertex primitive.
 * The object argument is not used by this function.
 */
void AsyVkRender::compressCount(FrameObject & object)
{
  auto const constants = buildPushConstants();

  currentCommandBuffer.bindPipeline(vk::PipelineBindPoint::eGraphics,
                                    *compressPipeline);
  currentCommandBuffer.pushConstants(*graphicsPipelineLayout,
                                     vk::ShaderStageFlagBits::eFragment,
                                     0,
                                     sizeof(PushConstants),
                                     &constants);
  currentCommandBuffer.draw(3, 1, 0, 0);
}

/**
 * Record and submit the transparency pre-pass for one frame.
 *
 * Steps, in order:
 *  1. Record the count render pass into object.countCommandBuffer: opaque
 *     geometry (skipped when interlock is set), then transparent geometry in
 *     the next subpass, each with its PIPELINE_COUNT pipeline.
 *  2. With GPUcompress, record the compression draw, submit the count pass on
 *     its own, wait for it and read the element count back from the element
 *     buffer.  Otherwise the element count is simply the pixel count and the
 *     count command buffer is queued for the combined submission below.
 *  3. Record the partial sums into object.computeCommandBuffer.
 *  4. Submit the queued command buffers with submit2, signalling
 *     renderTimelineSemaphore at object.computeTimelineValue and
 *     object.inComputeFence.
 *  5. When verbosity reaches timePartialSumVerbosity, additionally time NSUMS
 *     repetitions of the partial sums and print the result.
 *
 * @param object      per-frame resources (command buffers, fences, events).
 * @param imageIndex  image index forwarded to beginCountFrameRender.
 */
void AsyVkRender::refreshBuffers(FrameObject & object, int imageIndex) {
  // Command buffers for the combined count+compute submission
  // (Vulkan 1.3+ style submission).
  std::vector<vk::CommandBufferSubmitInfo> cmdBufferInfos;

  beginFrameCommands(*object.countCommandBuffer);

  beginCountFrameRender(imageIndex);
  currentCommandBuffer.bindDescriptorSets(vk::PipelineBindPoint::eGraphics, *graphicsPipelineLayout, 0, 1, &*object.descriptorSet, 0, nullptr);

  if (!interlock) {
    drawBuffer(object.pointBuffers, &pointData,
               *pointPipelines[PIPELINE_COUNT]);
    drawBuffer(object.lineBuffers, &lineData,
               *linePipelines[PIPELINE_COUNT]);
    drawBuffer(object.materialBuffers, &materialData,
               *materialPipelines[PIPELINE_COUNT]);
    drawBuffer(object.colorBuffers, &colorData,
               *colorPipelines[PIPELINE_COUNT]);
    drawBuffer(object.triangleBuffers, &triangleData,
               *trianglePipelines[PIPELINE_COUNT]);
  }

  currentCommandBuffer.nextSubpass(vk::SubpassContents::eInline);

  // draw transparent
  drawBuffer(object.transparentBuffers, &transparentData,
             *transparentPipelines[PIPELINE_COUNT]);

  currentCommandBuffer.nextSubpass(vk::SubpassContents::eInline);

  if (GPUcompress) {
    // Seed the element counter before the GPU pass runs.
    std::uint32_t* p = elemBfMappedMem->getCopyPtr();
    p[0]=1;

    compressCount(object);
    endFrameRender();
    currentCommandBuffer.setEvent(*object.compressionFinishedEvent, vk::PipelineStageFlagBits::eFragmentShader);
    endFrameCommands();

    // Before submitting the count/compress pass, ensure any pending buffer
    // transfers (vertex/index data uploads) have completed.  The count pass
    // reads from those buffers, so they must be up to date.
    if (object.transferHasPendingWork) {
      object.copyCountCommandBuffer->end();

      auto transferSubmitInfo = vk::SubmitInfo(0, nullptr, nullptr,
        1, &*object.copyCountCommandBuffer, 0, nullptr);
      if (!object.transferFence)
        object.transferFence = device->createFenceUnique(vk::FenceCreateInfo());
      vkutils::checkVkResult(device->resetFences(1, &*object.transferFence));
      vkutils::checkVkResult(transferQueue.submit(1, &transferSubmitInfo, *object.transferFence));
      vkutils::checkVkResult(device->waitForFences(
        1, &*object.transferFence, VK_TRUE, vkTimeout));

      // Reset for reuse by the later endAndSubmitTransfers call.
      object.copyCountCommandBuffer->reset();
      object.copyCountCommandBuffer->begin(vk::CommandBufferBeginInfo());
      object.transferHasPendingWork = false;
    }

    // Create a fence for synchronization
    auto compressFence = device->createFenceUnique(vk::FenceCreateInfo());

    // Submit the command buffer with the fence
    auto submitInfo = vk::SubmitInfo();
    submitInfo.commandBufferCount = 1;
    submitInfo.pCommandBuffers = &currentCommandBuffer;

    vkutils::checkVkResult(renderQueue.submit(1, &submitInfo, *compressFence));

    // Wait for the fence with a reasonable timeout
    vkutils::checkVkResult(device->waitForFences(
      1, &*compressFence, VK_TRUE, vkTimeout
    ));

    // Invalidate cached host-visible memory before reading GPU results.
    elemBfMappedMem->invalidate();
    elements=p[0];
    p[0]=1;
  } else {
    endFrameRender();
    endFrameCommands();
    elements=pixels;
    // The count pass is submitted together with the compute pass below.
    cmdBufferInfos.push_back({*object.countCommandBuffer});
  }

  // NOTE(review): this early return submits nothing that signals
  // object.inComputeFence, while preDrawBuffers() goes on to call
  // resizeFragmentBuffer(), which waits on that fence.  Confirm that
  // elements can never be 0 here, or that the fence is signalled elsewhere.
  if (elements == 0)
    return;

  beginFrameCommands(*object.computeCommandBuffer);
  // Round the element count up to a whole number of groups.
  g=ceilquotient(elements,groupSize);
  elements=groupSize*g;

  const unsigned int NSUMS=10000;

  if(settings::verbose >= timePartialSumVerbosity) {
    cerr << "Timing partial sums:" << endl;
    device->resetEvent(*object.startTimedSumsEvent);
    device->resetEvent(*object.timedSumsFinishedEvent);
    // Start recording commands into partialSumsCommandBuffer
    object.partialSumsCommandBuffer->begin(vk::CommandBufferBeginInfo());

    // Wait to execute the compute shaders until we trigger them from CPU
    object.partialSumsCommandBuffer->waitEvents(
      1,
      &*object.startTimedSumsEvent,
      vk::PipelineStageFlagBits::eHost,
      vk::PipelineStageFlagBits::eComputeShader,
      0,
      nullptr,
      0,
      nullptr,
      0,
      nullptr
      );

    // Record all partial sums calcs into partialSumsCommandBuffer
    for(unsigned int i=0; i < NSUMS; ++i)
      partialSums(object,true);

    // Signal to the CPU once the compute shaders have executed
    object.partialSumsCommandBuffer->setEvent(*object.timedSumsFinishedEvent, vk::PipelineStageFlagBits::eComputeShader);
    object.partialSumsCommandBuffer->end();
  }

  partialSums(object);
  endFrameCommands();
  cmdBufferInfos.push_back({*object.computeCommandBuffer});

  // This submission is for the transparency pre-computation (count + partial sums).
  // It MUST be synchronized with a fence because the CPU needs to read back the results
  // in resizeFragmentBuffer before the main graphics pass can be recorded.

  // Check if we have pending transfers before ending them
  bool hasPendingTransfers = object.transferHasPendingWork;

  // End and submit any pending buffer transfers before the count pass
  endAndSubmitTransfers(object, transferQueue);

  // Use timeline semaphore for more efficient synchronization.  The value to
  // signal travels in the SemaphoreSubmitInfo below, so no separate
  // TimelineSemaphoreSubmitInfo is needed with submit2.
  currentTimelineValue++;
  object.computeTimelineValue = currentTimelineValue;

  // Reset the fence before submission
  (void) device->resetFences(1, &*object.inComputeFence);

  // Wait for transfers to complete before executing count/compute passes
  std::vector<vk::SemaphoreSubmitInfo> waitSemInfos;
  if (hasPendingTransfers) {
    waitSemInfos.push_back({*object.transferDoneSemaphore, 0, vk::PipelineStageFlagBits2::eAllCommands});
  }

  std::vector<vk::SemaphoreSubmitInfo> signalSemInfos;
  signalSemInfos.push_back({*renderTimelineSemaphore, object.computeTimelineValue, vk::PipelineStageFlagBits2::eAllCommands});

  vk::SubmitInfo2 submitInfo2({}, waitSemInfos, cmdBufferInfos, signalSemInfos);
  vkutils::checkVkResult(renderQueue.submit2(1, &submitInfo2, *object.inComputeFence));

  if(settings::verbose >= timePartialSumVerbosity) {
    // Wait until the render queue isn't being used, so we only time
    // our partial sums calculation
    renderQueue.waitIdle();

    auto partialSumsInfo = vk::SubmitInfo();

    partialSumsInfo.commandBufferCount = 1;
    partialSumsInfo.pCommandBuffers = &*object.partialSumsCommandBuffer;

    // Signal GPU to start partial sums
    device->setEvent(*object.startTimedSumsEvent);

    // Send all the partial sum commands to the GPU.
    vkutils::checkVkResult(renderQueue.submit(1, &partialSumsInfo, nullptr));

    // Start recording the time
    utils::stopWatch Timer;

    // Wait until the GPU tells us the sums are finished
    waitForEvent(*object.timedSumsFinishedEvent);

    // End recording; report the average over the NSUMS repetitions
    double T=Timer.seconds()/NSUMS;
    cout << "elements=" << elements << endl;
    cout << "T (ms)=" << T*1e3 << endl;
    cout << "Megapixels/second=" << elements/T/1e6 << endl;
  }
}

/**
 * Record the blend pass: bind the blend pipeline, upload the fragment-stage
 * push constants and draw a single three-vertex primitive.  The imageIndex
 * argument is not used by this function.
 */
void AsyVkRender::blendFrame(int imageIndex)
{
  auto const constants = buildPushConstants();

  currentCommandBuffer.bindPipeline(vk::PipelineBindPoint::eGraphics,
                                    *blendPipeline);
  currentCommandBuffer.pushConstants(*graphicsPipelineLayout,
                                     vk::ShaderStageFlagBits::eFragment,
                                     0,
                                     sizeof(PushConstants),
                                     &constants);
  currentCommandBuffer.draw(3, 1, 0, 0);
}

/**
 * Run the transparency pre-pass for a frame, if one is needed.
 *
 * Always clears the copied flag.  For opaque scenes nothing else happens.
 * Otherwise this waits for the frame's previous compute submission, resets
 * the per-frame fence, events and command buffers, starts transfer
 * recording, runs refreshBuffers() followed by resizeFragmentBuffer(), and
 * finally sets copied unless interlock is enabled.
 */
void AsyVkRender::preDrawBuffers(FrameObject & object, int imageIndex)
{
  copied=false;

  if(Opaque)
    return;

  // Avoid blocking CPU wait when possible: only wait if this frame object
  // has issued a compute submission before.
  if (object.computeTimelineValue > 0)
    waitForTimelineSemaphore(*renderTimelineSemaphore,
                             object.computeTimelineValue,
                             vkTimeout);

  // Return the synchronization objects to their unsignalled state.
  vkutils::checkVkResult(device->resetFences(1, &*object.inComputeFence));
  device->resetEvent(*object.sumFinishedEvent);
  device->resetEvent(*object.compressionFinishedEvent);

  object.countCommandBuffer->reset();
  object.computeCommandBuffer->reset();

  // Begin recording buffer transfers for the count pass
  beginTransferRecording(object);

  refreshBuffers(object, imageIndex);
  resizeFragmentBuffer(object);

  if (!interlock)
    copied=true;
}

/**
 * Record the main graphics render pass for a frame: all opaque geometry
 * first and, for non-opaque scenes, the transparent geometry and the blend
 * pass in the two following subpasses.
 */
void AsyVkRender::drawBuffers(FrameObject & object, int imageIndex)
{
  beginGraphicsFrameRender(imageIndex);
  currentCommandBuffer.bindDescriptorSets(vk::PipelineBindPoint::eGraphics,
                                          *graphicsPipelineLayout,
                                          0,
                                          1, &*object.descriptorSet,
                                          0, nullptr);

  // Opaque geometry
  drawPoints(object);
  drawLines(object);
  drawMaterials(object);
  drawColors(object);
  drawTriangles(object);

  if(!Opaque) {
    // Transparent geometry
    currentCommandBuffer.nextSubpass(vk::SubpassContents::eInline);
    drawTransparent(object);

    // Blend
    currentCommandBuffer.nextSubpass(vk::SubpassContents::eInline);
    blendFrame(imageIndex);
  }

  endFrameRender();
}

/**
 * Record the post-process compute dispatch for one frame.
 *
 * Reports a runtime error if frameIndex has no valid post-process descriptor
 * set; otherwise binds the post-process pipeline and that descriptor set and
 * dispatches postProcessThreadGroupCount.width x .height x 1 work groups.
 */
void AsyVkRender::postProcessImage(vk::CommandBuffer& cmdBuffer, uint32_t const& frameIndex)
{
  bool const inRange = frameIndex < postProcessDescSet.size();
  if (!inRange || !postProcessDescSet[frameIndex])
    runtimeError("Invalid post-process descriptor set");

  cmdBuffer.bindPipeline(vk::PipelineBindPoint::eCompute, *postProcessPipeline);

  // Single-element set list for the bind call.
  std::vector const descriptorSets{*postProcessDescSet[frameIndex]};
  cmdBuffer.bindDescriptorSets(
          vk::PipelineBindPoint::eCompute,
          *postProcessPipelineLayout,
          0,
          VEC_VIEW(descriptorSets),
          EMPTY_VIEW
  );

  cmdBuffer.dispatch(postProcessThreadGroupCount.width,
                     postProcessThreadGroupCount.height,
                     1);
}

/**
 * Record a blit of prePresentationImages[frameIndex] onto
 * backbufferImages[frameIndex].
 *
 * The back buffer image is moved to TransferDstOptimal for the blit and then
 * returned to the layout it is transitioned from here (PresentSrcKHR when
 * viewing, ColorAttachmentOptimal otherwise).  The source image is blitted
 * from the TransferSrcOptimal layout.
 */
void AsyVkRender::copyToSwapchainImg(vk::CommandBuffer& cmdBuffer, uint32_t const& frameIndex)
{
  auto const colorRange =
    vk::ImageSubresourceRange(vk::ImageAspectFlagBits::eColor, 0, 1, 0, 1);
  auto const origin = vk::Offset3D(0, 0, 0);
  auto const farCorner = vk::Offset3D(
    static_cast<int32_t>(backbufferExtent.width),
    static_cast<int32_t>(backbufferExtent.height),
    1);

  // Formats differ (pre-presentation RGBA8 -> swapchain BGRA8), use blit.
  // Source and destination cover the same full-image region.
  vk::ImageBlit blit{};
  blit.srcSubresource = {vk::ImageAspectFlagBits::eColor, 0, 0, 1};
  blit.srcOffsets[0] = origin;
  blit.srcOffsets[1] = farCorner;
  blit.dstSubresource = {vk::ImageAspectFlagBits::eColor, 0, 0, 1};
  blit.dstOffsets[0] = origin;
  blit.dstOffsets[1] = farCorner;

  vk::ImageLayout restingLayout = vk::ImageLayout::eColorAttachmentOptimal;
  if (View)
    restingLayout = vk::ImageLayout::ePresentSrcKHR;

  // Back buffer: resting layout -> transfer destination
  transitionImageLayout(
    cmdBuffer,
    backbufferImages[frameIndex],
    vk::AccessFlagBits::eNone,
    vk::AccessFlagBits::eTransferWrite,
    restingLayout,
    vk::ImageLayout::eTransferDstOptimal,
    vk::PipelineStageFlagBits::eTransfer,
    vk::PipelineStageFlagBits::eTransfer,
    colorRange
  );

  cmdBuffer.blitImage(
    prePresentationImages[frameIndex].getImage(),
    vk::ImageLayout::eTransferSrcOptimal,
    backbufferImages[frameIndex],
    vk::ImageLayout::eTransferDstOptimal,
    1, &blit, vk::Filter::eNearest
    );

  // Back buffer: transfer destination -> resting layout
  transitionImageLayout(
    cmdBuffer,
    backbufferImages[frameIndex],
    vk::AccessFlagBits::eTransferWrite,
    vk::AccessFlagBits::eMemoryRead,
    vk::ImageLayout::eTransferDstOptimal,
    restingLayout,
    vk::PipelineStageFlagBits::eTransfer,
    vk::PipelineStageFlagBits::eTransfer,
    colorRange
  );
}

void AsyVkRender::drawFrame()
{
  auto& frameObject = frameObjects[currentFrame];

  // Wait only if we are about to reuse a frame that is still in use by the GPU.
  // We check if the timeline value for this specific frame has been reached.
  if (frameObject.timelineValue > 0) {
    waitForTimelineSemaphore(*renderTimelineSemaphore, frameObject.timelineValue, vkTimeout);
  }

  if (View && !swapChain) {
    initializeSwapChainIfNeeded();
  }

  // Detect srgb setting changes and recreate pipelines accordingly
  bool newSrgb = settings::getSetting<bool>("srgb");
  if (newSrgb != srgb) {
    srgb = newSrgb;
    recreatePipeline = true;
  }

  if (recreatePipeline)
  {
    device->waitIdle();
    recreatePipeline = false;
    createGraphicsPipelines();
  }

  uint32_t imageIndex = 0;
  if (View) {
    auto const result = device->acquireNextImageKHR(*swapChain, vkTimeout, *frameObject.imageAvailableSemaphore, nullptr, &imageIndex);
    if (result == vk::Result::eErrorOutOfDateKHR || result == vk::Result::eSuboptimalKHR || framebufferResized) {
      framebufferResized = false;
      recreateSwapChain();
      return;
    }
    else if (result == vk::Result::eErrorOutOfDeviceMemory) {
      outOfMemory();
    }
    else if (result != vk::Result::eSuccess && result != vk::Result::eSuboptimalKHR) {
      std::stringstream buf;
      buf << "Error: Failed to acquire swapchain image: " << vk::to_string(result) << std::endl;
      runtimeError(buf.str());
    }
  }

  frameObject.commandBuffer->reset(vk::CommandBufferResetFlagBits());

  updateUniformBuffer(currentFrame);
  updateBuffers();

  try {
    preDrawBuffers(frameObject, imageIndex);
  } catch (const vk::OutOfDeviceMemoryError& e) {
    outOfMemory();
  }

  // Begin recording buffer transfers for the main render pass
  beginTransferRecording(frameObject);

  beginFrameCommands(getFrameCommandBuffer());
  drawBuffers(frameObject, imageIndex);
  if (fxaa) {
    auto& cmdBuffer = *frameObject.commandBuffer;

    // Run FXAA compute shader
    postProcessImage(cmdBuffer, imageIndex);

    // Prepare for presentation
    transitionImageLayout(
      cmdBuffer,
      prePresentationImages[imageIndex].getImage(),
      vk::AccessFlagBits::eShaderWrite,
      vk::AccessFlagBits::eTransferRead,
      vk::ImageLayout::eGeneral,
      vk::ImageLayout::eTransferSrcOptimal,
      vk::PipelineStageFlagBits::eComputeShader,
      vk::PipelineStageFlagBits::eTransfer,
      vk::ImageSubresourceRange(vk::ImageAspectFlagBits::eColor, 0, 1, 0, 1)
    );

    copyToSwapchainImg(cmdBuffer, imageIndex);

    transitionImageLayout(
        cmdBuffer,
        prePresentationImages[imageIndex].getImage(),
        vk::AccessFlagBits::eTransferRead,
        vk::AccessFlagBits::eShaderWrite,
        vk::ImageLayout::eTransferSrcOptimal,
        vk::ImageLayout::eGeneral,
        vk::PipelineStageFlagBits::eTransfer,
        vk::PipelineStageFlagBits::eComputeShader,
        vk::ImageSubresourceRange(vk::ImageAspectFlagBits::eColor, 0, 1, 0, 1)
    );
  }
  endFrameCommands();

  // End and submit any pending buffer transfers before the main render pass
  bool hasPendingTransfers = frameObject.transferHasPendingWork;
  endAndSubmitTransfers(frameObject, transferQueue);

  std::vector<vk::Semaphore> waitSems;
  std::vector<uint64_t> waitSemaphoreValues;
  // For Vulkan 1.3+ style submission
  std::vector<vk::SemaphoreSubmitInfo> waitSemInfos;
  std::vector<vk::SemaphoreSubmitInfo> signalSemInfos;
  std::vector<vk::PipelineStageFlags> waitStages;
  if (View) {
      waitSems.push_back(*frameObject.imageAvailableSemaphore);
      waitStages.push_back(vk::PipelineStageFlagBits::eColorAttachmentOutput);
  }

  // Wait for buffer transfers to complete before rendering
  if (hasPendingTransfers) {
      waitSems.push_back(*frameObject.transferDoneSemaphore);
      waitStages.push_back(vk::PipelineStageFlagBits::eAllCommands);
  }

  std::vector<vk::Semaphore> signalSems;

  if (View) {
      if (imageIndex >= renderFinishedSemaphore.size())
          renderFinishedSemaphore.push_back(device->createSemaphoreUnique(vk::SemaphoreCreateInfo()));
      signalSemInfos.push_back({*renderFinishedSemaphore[imageIndex], 0, vk::PipelineStageFlagBits2::eAllCommands});
      signalSems.push_back(*renderFinishedSemaphore[imageIndex]);
  }

  vk::SubmitInfo submitInfo;
  submitInfo.pWaitSemaphores = waitSems.data();
  submitInfo.waitSemaphoreCount = waitSems.size();
  submitInfo.pWaitDstStageMask = waitStages.data();
  submitInfo.pCommandBuffers = &*frameObject.commandBuffer;
  submitInfo.commandBufferCount = 1;

  vk::TimelineSemaphoreSubmitInfo timelineInfo;
  std::vector<uint64_t> signalValues;

  if (!waitSems.empty()) {
    // Add wait values for binary semaphores (0)
    waitSemaphoreValues.resize(waitSems.size(), 0);
    timelineInfo.waitSemaphoreValueCount = waitSemaphoreValues.size();
    timelineInfo.pWaitSemaphoreValues = waitSemaphoreValues.data();
  }

  currentTimelineValue++;
  frameObject.timelineValue = currentTimelineValue;

  signalSemInfos.push_back({*renderTimelineSemaphore, frameObject.timelineValue, vk::PipelineStageFlagBits2::eAllCommands});
  signalSems.push_back(*renderTimelineSemaphore);

  // The value for the binary semaphore is ignored, but the count must match.
  if (View) {
      signalValues.push_back(0);
  }
  signalValues.push_back(frameObject.timelineValue);

  timelineInfo.signalSemaphoreValueCount = signalValues.size();
  timelineInfo.pSignalSemaphoreValues = signalValues.data();
  submitInfo.pNext = &timelineInfo;

  submitInfo.pSignalSemaphores = signalSems.data();
  submitInfo.signalSemaphoreCount = signalSems.size();

  // Reset the fence before submission
  (void) device->resetFences(1, &*frameObject.inFlightFence);

  try {
    vkutils::checkVkResult(renderQueue.submit(1, &submitInfo, nullptr));
  } catch (const vk::OutOfDeviceMemoryError& e) {
    outOfMemory();
  }

  if (View) {
    // The presentation engine only needs to wait on the binary semaphore.
    std::vector<vk::Semaphore> presentWaitSemaphores;
    presentWaitSemaphores.push_back(*renderFinishedSemaphore[imageIndex]);

    try {
      auto presentInfo = vk::PresentInfoKHR(VEC_VIEW(presentWaitSemaphores), 1, &*swapChain, &imageIndex);
      auto const result = presentQueue.presentKHR(presentInfo);

      if (result == vk::Result::eErrorOutOfDateKHR || result == vk::Result::eSuboptimalKHR || framebufferResized) {
        framebufferResized = false;
        recreateSwapChain();
      } else if (result != vk::Result::eSuccess) {
        if (result == vk::Result::eErrorOutOfDeviceMemory) {
          outOfMemory();
        } else {
          runtimeError("failed to present swapchain image: " + vk::to_string(result));
        }
      }
    } catch (std::exception const & e) {
      auto what=std::string(e.what());
      if (what.find("ErrorOutOfDateKHR") != std::string::npos) {
        framebufferResized = false;
        recreateSwapChain();
      } else {
        if (what.find("OutOfDeviceMemory") != std::string::npos) {
          outOfMemory();
        } else {
          runtimeError(what);
        }
      }
    }
  }

  if(queueExport) {
    // Wait for the just-submitted frame to finish before exporting
    waitForTimelineSemaphore(*renderTimelineSemaphore, frameObject.timelineValue);
    Export(imageIndex);
    queueExport=false;
  }

  if (recreateBlendPipeline) {
    waitForTimelineSemaphore(*renderTimelineSemaphore, frameObject.timelineValue);
    createBlendPipeline();
    recreateBlendPipeline=false;
  }

  currentFrame = (currentFrame + 1) % maxFramesInFlight;
}

/**
 * Swap front and back buffers.
 *
 * Intentionally empty for Vulkan: presentation is performed by drawFrame().
 */
void AsyVkRender::swapBuffers()
{
}

/**
 * Return the stored GLFW window pointer.
 */
GLFWwindow* AsyVkRender::getRenderWindow() const
{
  return glfwWindow;
}

/**
 * Export handler taking an unused int argument: exports image 0 and marks
 * that the ready signal should be raised once the export has finished.
 */
void AsyVkRender::exportHandler(int)
{
  // Must be set before Export(), which consumes the flag at its end.
  readyAfterExport=true;
  Export(0);
}

void AsyVkRender::Export(int imageIndex) {
  exportCommandBuffer->reset();

  vkutils::checkVkResult(device->resetFences(1, &*exportFence));

  exportCommandBuffer->begin(vk::CommandBufferBeginInfo(vk::CommandBufferUsageFlagBits::eOneTimeSubmit));

  auto const size = device->getImageMemoryRequirements(backbufferImages[0]).size;
  auto const swapExtent = vk::Extent3D(
    backbufferExtent.width,
    backbufferExtent.height,
    1
  );
  auto const reg = vk::BufferImageCopy(
    0,
    backbufferExtent.width,
    backbufferExtent.height,
    vk::ImageSubresourceLayers(
      vk::ImageAspectFlagBits::eColor, 0, 0, 1
    ),
    { },
    swapExtent
  );

  vma::cxx::UniqueBuffer exportBuf = createBufferUnique(
    vk::BufferUsageFlagBits::eTransferDst,
    VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT,
    size,
    VMA_ALLOCATION_CREATE_HOST_ACCESS_RANDOM_BIT);

  transitionImageLayout(
    *exportCommandBuffer,
    backbufferImages[imageIndex],
    vk::AccessFlagBits::eMemoryRead,
    vk::AccessFlagBits::eTransferRead,
    !View ? vk::ImageLayout::eColorAttachmentOptimal : vk::ImageLayout::ePresentSrcKHR,
    vk::ImageLayout::eTransferSrcOptimal,
    vk::PipelineStageFlagBits::eTransfer,
    vk::PipelineStageFlagBits::eTransfer,
    vk::ImageSubresourceRange(
      vk::ImageAspectFlagBits::eColor,
      0,
      1,
      0,
      1
    )
  );

  exportCommandBuffer->copyImageToBuffer(backbufferImages[imageIndex], vk::ImageLayout::eTransferSrcOptimal, exportBuf.getBuffer(), 1, &reg);

  transitionImageLayout(
    *exportCommandBuffer,
    backbufferImages[imageIndex],
    vk::AccessFlagBits::eTransferRead,
    vk::AccessFlagBits::eMemoryRead,
    vk::ImageLayout::eTransferSrcOptimal,
    !View ? vk::ImageLayout::eColorAttachmentOptimal : vk::ImageLayout::ePresentSrcKHR,
    vk::PipelineStageFlagBits::eTransfer,
    vk::PipelineStageFlagBits::eTransfer,
    vk::ImageSubresourceRange(
      vk::ImageAspectFlagBits::eColor,
      0,
      1,
      0,
      1
    )
  );

  exportCommandBuffer->end();

  auto const submitInfo = vk::SubmitInfo(
    0, nullptr, nullptr,
    1, &*exportCommandBuffer,
    0, nullptr
  );

  if (renderQueue.submit(1, &submitInfo, *exportFence) != vk::Result::eSuccess)
    runtimeError("failed to submit draw command buffer");

  vkutils::checkVkResult(device->waitForFences(
    1, &*exportFence, VK_TRUE, vkTimeout
  ));

  vma::cxx::MemoryMapperLock mappedMemory(exportBuf);

  auto * fmt = new unsigned char[backbufferExtent.width * backbufferExtent.height * 3]; // 3 for RGB

  auto data=mappedMemory.getCopyPtr<unsigned char>();
  for (auto i = 0u; i < backbufferExtent.height; i++)
    for (auto j = 0u; j < backbufferExtent.width; j++)
      for (auto k = 0u; k < 3; k++)
        // need to flip vertically and swap byte order due to little endian in image data
        // 4 for sizeof unsigned (RGBA)
        fmt[(backbufferExtent.height-1-i)*backbufferExtent.width*3+j*3+(2-k)]=data[i*backbufferExtent.width*4+j*4+k];

  picture pic;
  double w=oWidth;
  double h=oHeight;
  double Aspect=((double) backbufferExtent.width)/backbufferExtent.height;
  if(w > h*Aspect) w=(int) (h*Aspect+0.5);
  else h=(int) (w/Aspect+0.5);

  if(settings::verbose > 1)
    cout << "Exporting " << Prefix << " as " << backbufferExtent.width << "x"
         << backbufferExtent.height << " image" << endl;

  auto * const Image=new camp::drawRawImage(fmt,
                                            backbufferExtent.width,
                                            backbufferExtent.height,
                                            transform(0.0,0.0,w,0.0,0.0,h),
                                            antialias);
  pic.append(Image);
  pic.shipout(NULL,Prefix,Format,false,ViewExport);
  delete Image;
  delete[] fmt;
  queueExport=false;
  setProjection();
  remesh=true;
  redraw=true;

#ifdef HAVE_PTHREAD
  if(threads && readyAfterExport) {
    readyAfterExport=false;
    threadMgr.endwait(threadMgr.readySignal,threadMgr.readyLock);
  }
#endif
}

/**
 * Forward to glslang::FinalizeProcess().
 */
void AsyVkRender::finalizeProcess()
{
  glslang::FinalizeProcess();
}

/**
 * Apply the base-class reshape for the new size, then flag the framebuffer
 * as resized; drawFrame() recreates the swap chain when it sees the flag.
 */
void AsyVkRender::reshape(int width, int height)
{
  AsyRender::reshape(width, height);
  framebufferResized = true;
}

/**
 * Cycle the render mode.
 *
 * Waits for the device to go idle (when one exists), delegates the actual
 * mode change to AsyRender::cycleMode(), and then requests a new uniform
 * buffer and a pipeline recreation.
 */
void AsyVkRender::cycleMode()
{
  // Wait for GPU to finish (Vulkan-specific)
  if (device)
    device->waitIdle();

  // Use base class implementation for mode cycling
  AsyRender::cycleMode();

  // Vulkan-specific: update uniform buffer and pipeline flags
  recreatePipeline = true;
  newUniformBuffer = true;
}

} // namespace camp

#endif // HAVE_VULKAN
