From patchwork Thu Feb 21 03:57:52 2019 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Philip Langdale X-Patchwork-Id: 12122 Return-Path: X-Original-To: patchwork@ffaux-bg.ffmpeg.org Delivered-To: patchwork@ffaux-bg.ffmpeg.org Received: from ffbox0-bg.mplayerhq.hu (ffbox0-bg.ffmpeg.org [79.124.17.100]) by ffaux.localdomain (Postfix) with ESMTP id 3C4904497CC for ; Thu, 21 Feb 2019 05:58:18 +0200 (EET) Received: from [127.0.1.1] (localhost [127.0.0.1]) by ffbox0-bg.mplayerhq.hu (Postfix) with ESMTP id 1F76B68ACF4; Thu, 21 Feb 2019 05:58:18 +0200 (EET) X-Original-To: ffmpeg-devel@ffmpeg.org Delivered-To: ffmpeg-devel@ffmpeg.org Received: from mail-yw1-f100.google.com (mail-yw1-f100.google.com [209.85.161.100]) by ffbox0-bg.mplayerhq.hu (Postfix) with ESMTPS id 9B68468A92C for ; Thu, 21 Feb 2019 05:58:07 +0200 (EET) Received: by mail-yw1-f100.google.com with SMTP id o184so10125281ywo.5 for ; Wed, 20 Feb 2019 19:58:07 -0800 (PST) X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=1e100.net; s=20161025; h=x-gm-message-state:dkim-signature:from:to:cc:subject:date :message-id:in-reply-to:references:mime-version :content-transfer-encoding; bh=Y4bu3C6k5N6IoTNpsIttYM58Xv37I/RPk3o22YxnJvY=; b=OmCitLVZw/nI0rrrQSPSfPUX+Iu0Y8fekNF2IQLmzamIT+4YtPatahhwuD1H+pjH5I RzQJMthFR+MWd9J1ZBQZgC2myUd8YAka3zRkWzZNHtT5AtwW0DpQORckpRYMVWXYWqyt Ug4PO2qVAKUpaZ1j118mRFmyXKWU8Yh4yp1LBofF1d6F8YpB/QbWQnhElvVXEfMVmrsl /x+d3VjayTDgZBWItgTD9NaHu8EVlKRO8TlAj9rDTAZwQLbwp8LlqiymuZM/P+OsdyuF 9CBJPuxIpgORxSE2x9z9ISuFUY1HYihQJbRvYZTWaimn3A+wl8y0AMp1B8CIiZXzyAtv plcA== X-Gm-Message-State: AHQUAuaXd3rxcv5BGKIn7B89Pwa2zbrcv0rfNk66sZCND1K1LiswAnh8 zWYVZ0n95iwRzK4BjVlekF3sU9lfpm3BFETCfR1bqdQd+/uHqQ== X-Google-Smtp-Source: AHgI3IaIIiYvYmSZO1RrX0kq2UmegJIu64g7WMK1uTMn9VSh9z791LuG+8Awh7CtA8LdgQZuATnz8ux5/+TK X-Received: by 2002:a81:5c07:: with SMTP id q7mr31224322ywb.149.1550721486147; Wed, 20 Feb 2019 19:58:06 -0800 (PST) Received: 
from mail.overt.org (155.208.178.107.bc.googleusercontent.com. [107.178.208.155]) by smtp-relay.gmail.com with ESMTPS id 14sm669474ywv.6.2019.02.20.19.58.06 for (version=TLS1_2 cipher=ECDHE-RSA-CHACHA20-POLY1305 bits=256/256); Wed, 20 Feb 2019 19:58:06 -0800 (PST) X-Relaying-Domain: gapps.overt.org Received: from authenticated-user (mail.overt.org [107.178.208.155]) (using TLSv1.2 with cipher ECDHE-RSA-AES128-GCM-SHA256 (128/128 bits)) (No client certificate requested) by mail.overt.org (Postfix) with ESMTPSA id EBFFE3F07F; Wed, 20 Feb 2019 21:58:04 -0600 (CST) DKIM-Signature: v=1; a=rsa-sha256; c=simple/simple; d=overt.org; s=mail; t=1550721485; bh=MyFbemU1xSCttv36bqJwdIHNWib1X1J6fIVv/AnS58A=; h=From:To:Cc:Subject:Date:In-Reply-To:References:From; b=a/eUlnE1SbqfHvvZdNGg9iSRHeZbKoXeD67on6UWho3kGTylW7YGEeyKdaWIdn+7u VAfmR/WSYhpBRsyv4UZ+D1S+KVp40k0nWYxTxAObn7tMxL2RWTejhrbwJSPJplN0vO JuJ13PawB7IOuLJZQAlIeauzerRovqckOIB83zAwfGIMHuB8SH+GrmDxBefbiS3Xrl 6SNI8nMpTC3L4aUk0dVockeXwx9rG1pDotiS2cnJVtXaqh++OcUINfyrR0CJqANMpG ZamwaDMRuel7sm3vUGsE2if1qeM157j1Gidd5kBDhNv+uZ/Fm/QFAmXZ0ZTyOeb9vY trYMRxEOuW0rw== From: Philip Langdale To: ffmpeg-devel@ffmpeg.org Date: Wed, 20 Feb 2019 19:57:52 -0800 Message-Id: <20190221035753.27525-5-philipl@overt.org> In-Reply-To: <20190221035753.27525-1-philipl@overt.org> References: <20190221035753.27525-1-philipl@overt.org> MIME-Version: 1.0 Subject: [FFmpeg-devel] [PATCH 4/5] avfilter/vf_thumbnail_cuda: Switch to using ffnvcodec X-BeenThere: ffmpeg-devel@ffmpeg.org X-Mailman-Version: 2.1.20 Precedence: list List-Id: FFmpeg development discussions and patches List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Reply-To: FFmpeg development discussions and patches Cc: Philip Langdale Errors-To: ffmpeg-devel-bounces@ffmpeg.org Sender: "ffmpeg-devel" This change switches the vf_thumbnail_cuda filter from using the full cuda sdk to using the ffnvcodec headers and loader. 
Most of the change is a direct mapping, but I also switched from using texture references to using texture objects. This is supposed to be the preferred way of using textures, and the texture object API is the one I added to ffnvcodec. Signed-off-by: Philip Langdale <philipl@overt.org> --- configure | 2 +- libavfilter/vf_thumbnail_cuda.c | 147 +++++++++++++++++-------------- libavfilter/vf_thumbnail_cuda.cu | 25 +++--- 3 files changed, 93 insertions(+), 81 deletions(-) diff --git a/configure b/configure index 57098149f9..31576350bd 100755 --- a/configure +++ b/configure @@ -2967,7 +2967,7 @@ v4l2_m2m_deps="linux_videodev2_h sem_timedwait" hwupload_cuda_filter_deps="ffnvcodec" scale_npp_filter_deps="ffnvcodec libnpp" scale_cuda_filter_deps="ffnvcodec cuda_nvcc" -thumbnail_cuda_filter_deps="cuda_sdk" +thumbnail_cuda_filter_deps="ffnvcodec cuda_nvcc" transpose_npp_filter_deps="ffnvcodec libnpp" amf_deps_any="libdl LoadLibrary" diff --git a/libavfilter/vf_thumbnail_cuda.c b/libavfilter/vf_thumbnail_cuda.c index 22691e156f..0c06815643 100644 --- a/libavfilter/vf_thumbnail_cuda.c +++ b/libavfilter/vf_thumbnail_cuda.c @@ -20,10 +20,8 @@ * DEALINGS IN THE SOFTWARE. 
*/ -#include - #include "libavutil/hwcontext.h" -#include "libavutil/hwcontext_cuda.h" +#include "libavutil/hwcontext_cuda_internal.h" #include "libavutil/cuda_check.h" #include "libavutil/opt.h" #include "libavutil/pixdesc.h" @@ -31,7 +29,7 @@ #include "avfilter.h" #include "internal.h" -#define CHECK_CU(x) FF_CUDA_CHECK(ctx, x) +#define CHECK_CU(x) FF_CUDA_CHECK_DL(ctx, s->hwctx->internal->cuda_dl, x) #define HIST_SIZE (3*256) #define DIV_UP(a, b) ( ((a) + (b) - 1) / (b) ) @@ -60,6 +58,7 @@ typedef struct ThumbnailCudaContext { AVRational tb; ///< copy of the input timebase to ease access AVBufferRef *hw_frames_ctx; + AVCUDADeviceContext *hwctx; CUmodule cu_module; @@ -67,12 +66,10 @@ typedef struct ThumbnailCudaContext { CUfunction cu_func_uchar2; CUfunction cu_func_ushort; CUfunction cu_func_ushort2; - CUtexref cu_tex_uchar; - CUtexref cu_tex_uchar2; - CUtexref cu_tex_ushort; - CUtexref cu_tex_ushort2; + CUstream cu_stream; CUdeviceptr data; + } ThumbnailCudaContext; #define OFFSET(x) offsetof(ThumbnailCudaContext, x) @@ -157,29 +154,44 @@ static AVFrame *get_best_frame(AVFilterContext *ctx) return picref; } -static int thumbnail_kernel(ThumbnailCudaContext *ctx, CUfunction func, CUtexref tex, int channels, +static int thumbnail_kernel(AVFilterContext *ctx, CUfunction func, int channels, int *histogram, uint8_t *src_dptr, int src_width, int src_height, int src_pitch, int pixel_size) { - CUdeviceptr src_devptr = (CUdeviceptr)src_dptr; - void *args[] = { &histogram, &src_width, &src_height }; - CUDA_ARRAY_DESCRIPTOR desc; - - desc.Width = src_width; - desc.Height = src_height; - desc.NumChannels = channels; - if (pixel_size == 1) { - desc.Format = CU_AD_FORMAT_UNSIGNED_INT8; - } - else { - desc.Format = CU_AD_FORMAT_UNSIGNED_INT16; - } + int ret; + ThumbnailCudaContext *s = ctx->priv; + CudaFunctions *cu = s->hwctx->internal->cuda_dl; + CUtexObject tex = 0; + void *args[] = { &tex, &histogram, &src_width, &src_height }; - CHECK_CU(cuTexRefSetAddress2D_v3(tex, 
&desc, src_devptr, src_pitch)); - CHECK_CU(cuLaunchKernel(func, - DIV_UP(src_width, BLOCKX), DIV_UP(src_height, BLOCKY), 1, - BLOCKX, BLOCKY, 1, 0, 0, args, NULL)); + CUDA_TEXTURE_DESC tex_desc = { + .filterMode = CU_TR_FILTER_MODE_LINEAR, + .flags = CU_TRSF_READ_AS_INTEGER, + }; - return 0; + CUDA_RESOURCE_DESC res_desc = { + .resType = CU_RESOURCE_TYPE_PITCH2D, + .res.pitch2D.format = pixel_size == 1 ? + CU_AD_FORMAT_UNSIGNED_INT8 : + CU_AD_FORMAT_UNSIGNED_INT16, + .res.pitch2D.numChannels = channels, + .res.pitch2D.width = src_width, + .res.pitch2D.height = src_height, + .res.pitch2D.pitchInBytes = src_pitch, + .res.pitch2D.devPtr = (CUdeviceptr)src_dptr, + }; + + ret = CHECK_CU(cu->cuTexObjectCreate(&tex, &res_desc, &tex_desc, NULL)); + if (ret < 0) + goto exit; + + ret = CHECK_CU(cu->cuLaunchKernel(func, + DIV_UP(src_width, BLOCKX), DIV_UP(src_height, BLOCKY), 1, + BLOCKX, BLOCKY, 1, 0, s->cu_stream, args, NULL)); +exit: + if (tex) + CHECK_CU(cu->cuTexObjectDestroy(tex)); + + return ret; } static int thumbnail(AVFilterContext *ctx, int *histogram, AVFrame *in) @@ -189,40 +201,40 @@ static int thumbnail(AVFilterContext *ctx, int *histogram, AVFrame *in) switch (in_frames_ctx->sw_format) { case AV_PIX_FMT_NV12: - thumbnail_kernel(s, s->cu_func_uchar, s->cu_tex_uchar, 1, + thumbnail_kernel(ctx, s->cu_func_uchar, 1, histogram, in->data[0], in->width, in->height, in->linesize[0], 1); - thumbnail_kernel(s, s->cu_func_uchar2, s->cu_tex_uchar2, 2, + thumbnail_kernel(ctx, s->cu_func_uchar2, 2, histogram + 256, in->data[1], in->width / 2, in->height / 2, in->linesize[1], 1); break; case AV_PIX_FMT_YUV420P: - thumbnail_kernel(s, s->cu_func_uchar, s->cu_tex_uchar, 1, + thumbnail_kernel(ctx, s->cu_func_uchar, 1, histogram, in->data[0], in->width, in->height, in->linesize[0], 1); - thumbnail_kernel(s, s->cu_func_uchar, s->cu_tex_uchar, 1, + thumbnail_kernel(ctx, s->cu_func_uchar, 1, histogram + 256, in->data[1], in->width / 2, in->height / 2, in->linesize[1], 1); - 
thumbnail_kernel(s, s->cu_func_uchar, s->cu_tex_uchar, 1, + thumbnail_kernel(ctx, s->cu_func_uchar, 1, histogram + 512, in->data[2], in->width / 2, in->height / 2, in->linesize[2], 1); break; case AV_PIX_FMT_YUV444P: - thumbnail_kernel(s, s->cu_func_uchar, s->cu_tex_uchar, 1, + thumbnail_kernel(ctx, s->cu_func_uchar, 1, histogram, in->data[0], in->width, in->height, in->linesize[0], 1); - thumbnail_kernel(s, s->cu_func_uchar, s->cu_tex_uchar, 1, + thumbnail_kernel(ctx, s->cu_func_uchar, 1, histogram + 256, in->data[1], in->width, in->height, in->linesize[1], 1); - thumbnail_kernel(s, s->cu_func_uchar, s->cu_tex_uchar, 1, + thumbnail_kernel(ctx, s->cu_func_uchar, 1, histogram + 512, in->data[2], in->width, in->height, in->linesize[2], 1); break; case AV_PIX_FMT_P010LE: case AV_PIX_FMT_P016LE: - thumbnail_kernel(s, s->cu_func_ushort, s->cu_tex_ushort, 1, + thumbnail_kernel(ctx, s->cu_func_ushort, 1, histogram, in->data[0], in->width, in->height, in->linesize[0], 2); - thumbnail_kernel(s, s->cu_func_ushort2, s->cu_tex_ushort2, 2, + thumbnail_kernel(ctx, s->cu_func_ushort2, 2, histogram + 256, in->data[1], in->width / 2, in->height / 2, in->linesize[1], 2); break; case AV_PIX_FMT_YUV444P16: - thumbnail_kernel(s, s->cu_func_ushort2, s->cu_tex_uchar, 1, + thumbnail_kernel(ctx, s->cu_func_ushort2, 1, histogram, in->data[0], in->width, in->height, in->linesize[0], 2); - thumbnail_kernel(s, s->cu_func_ushort2, s->cu_tex_uchar, 1, + thumbnail_kernel(ctx, s->cu_func_ushort2, 1, histogram + 256, in->data[1], in->width, in->height, in->linesize[1], 2); - thumbnail_kernel(s, s->cu_func_ushort2, s->cu_tex_uchar, 1, + thumbnail_kernel(ctx, s->cu_func_ushort2, 1, histogram + 512, in->data[2], in->width, in->height, in->linesize[2], 2); break; default: @@ -236,10 +248,10 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *frame) { AVFilterContext *ctx = inlink->dst; ThumbnailCudaContext *s = ctx->priv; + CudaFunctions *cu = s->hwctx->internal->cuda_dl; AVFilterLink *outlink = 
ctx->outputs[0]; int *hist = s->frames[s->n].histogram; AVHWFramesContext *hw_frames_ctx = (AVHWFramesContext*)s->hw_frames_ctx->data; - AVCUDADeviceContext *device_hwctx = hw_frames_ctx->device_ctx->hwctx; CUcontext dummy; CUDA_MEMCPY2D cpy = { 0 }; int ret = 0; @@ -247,11 +259,11 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *frame) // keep a reference of each frame s->frames[s->n].buf = frame; - ret = CHECK_CU(cuCtxPushCurrent(device_hwctx->cuda_ctx)); + ret = CHECK_CU(cu->cuCtxPushCurrent(s->hwctx->cuda_ctx)); if (ret < 0) return ret; - CHECK_CU(cuMemsetD8(s->data, 0, HIST_SIZE * sizeof(int))); + CHECK_CU(cu->cuMemsetD8Async(s->data, 0, HIST_SIZE * sizeof(int), s->cu_stream)); thumbnail(ctx, (int*)s->data, frame); @@ -264,7 +276,7 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *frame) cpy.WidthInBytes = HIST_SIZE * sizeof(int); cpy.Height = 1; - ret = CHECK_CU(cuMemcpy2D(&cpy)); + ret = CHECK_CU(cu->cuMemcpy2DAsync(&cpy, s->cu_stream)); if (ret < 0) return ret; @@ -276,7 +288,7 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *frame) hist[i] = 4 * hist[i]; } - CHECK_CU(cuCtxPopCurrent(&dummy)); + CHECK_CU(cu->cuCtxPopCurrent(&dummy)); if (ret < 0) return ret; @@ -292,14 +304,15 @@ static av_cold void uninit(AVFilterContext *ctx) { int i; ThumbnailCudaContext *s = ctx->priv; + CudaFunctions *cu = s->hwctx->internal->cuda_dl; if (s->data) { - CHECK_CU(cuMemFree(s->data)); + CHECK_CU(cu->cuMemFree(s->data)); s->data = 0; } if (s->cu_module) { - CHECK_CU(cuModuleUnload(s->cu_module)); + CHECK_CU(cu->cuModuleUnload(s->cu_module)); s->cu_module = NULL; } @@ -342,43 +355,43 @@ static int config_props(AVFilterLink *inlink) AVHWFramesContext *hw_frames_ctx = (AVHWFramesContext*)inlink->hw_frames_ctx->data; AVCUDADeviceContext *device_hwctx = hw_frames_ctx->device_ctx->hwctx; CUcontext dummy, cuda_ctx = device_hwctx->cuda_ctx; + CudaFunctions *cu = device_hwctx->internal->cuda_dl; int ret; extern char vf_thumbnail_cuda_ptx[]; - ret = 
CHECK_CU(cuCtxPushCurrent(cuda_ctx)); + s->hwctx = device_hwctx; + s->cu_stream = s->hwctx->stream; + + ret = CHECK_CU(cu->cuCtxPushCurrent(cuda_ctx)); if (ret < 0) return ret; - ret = CHECK_CU(cuModuleLoadData(&s->cu_module, vf_thumbnail_cuda_ptx)); + ret = CHECK_CU(cu->cuModuleLoadData(&s->cu_module, vf_thumbnail_cuda_ptx)); if (ret < 0) return ret; - CHECK_CU(cuModuleGetFunction(&s->cu_func_uchar, s->cu_module, "Thumbnail_uchar")); - CHECK_CU(cuModuleGetFunction(&s->cu_func_uchar2, s->cu_module, "Thumbnail_uchar2")); - CHECK_CU(cuModuleGetFunction(&s->cu_func_ushort, s->cu_module, "Thumbnail_ushort")); - CHECK_CU(cuModuleGetFunction(&s->cu_func_ushort2, s->cu_module, "Thumbnail_ushort2")); + ret = CHECK_CU(cu->cuModuleGetFunction(&s->cu_func_uchar, s->cu_module, "Thumbnail_uchar")); + if (ret < 0) + return ret; - CHECK_CU(cuModuleGetTexRef(&s->cu_tex_uchar, s->cu_module, "uchar_tex")); - CHECK_CU(cuModuleGetTexRef(&s->cu_tex_uchar2, s->cu_module, "uchar2_tex")); - CHECK_CU(cuModuleGetTexRef(&s->cu_tex_ushort, s->cu_module, "ushort_tex")); - CHECK_CU(cuModuleGetTexRef(&s->cu_tex_ushort2, s->cu_module, "ushort2_tex")); + ret = CHECK_CU(cu->cuModuleGetFunction(&s->cu_func_uchar2, s->cu_module, "Thumbnail_uchar2")); + if (ret < 0) + return ret; - CHECK_CU(cuTexRefSetFlags(s->cu_tex_uchar, CU_TRSF_READ_AS_INTEGER)); - CHECK_CU(cuTexRefSetFlags(s->cu_tex_uchar2, CU_TRSF_READ_AS_INTEGER)); - CHECK_CU(cuTexRefSetFlags(s->cu_tex_ushort, CU_TRSF_READ_AS_INTEGER)); - CHECK_CU(cuTexRefSetFlags(s->cu_tex_ushort2, CU_TRSF_READ_AS_INTEGER)); + ret = CHECK_CU(cu->cuModuleGetFunction(&s->cu_func_ushort, s->cu_module, "Thumbnail_ushort")); + if (ret < 0) + return ret; - CHECK_CU(cuTexRefSetFilterMode(s->cu_tex_uchar, CU_TR_FILTER_MODE_LINEAR)); - CHECK_CU(cuTexRefSetFilterMode(s->cu_tex_uchar2, CU_TR_FILTER_MODE_LINEAR)); - CHECK_CU(cuTexRefSetFilterMode(s->cu_tex_ushort, CU_TR_FILTER_MODE_LINEAR)); - CHECK_CU(cuTexRefSetFilterMode(s->cu_tex_ushort2, CU_TR_FILTER_MODE_LINEAR)); + 
ret = CHECK_CU(cu->cuModuleGetFunction(&s->cu_func_ushort2, s->cu_module, "Thumbnail_ushort2")); + if (ret < 0) + return ret; - ret = CHECK_CU(cuMemAlloc(&s->data, HIST_SIZE * sizeof(int))); + ret = CHECK_CU(cu->cuMemAlloc(&s->data, HIST_SIZE * sizeof(int))); if (ret < 0) return ret; - CHECK_CU(cuCtxPopCurrent(&dummy)); + CHECK_CU(cu->cuCtxPopCurrent(&dummy)); s->hw_frames_ctx = ctx->inputs[0]->hw_frames_ctx; diff --git a/libavfilter/vf_thumbnail_cuda.cu b/libavfilter/vf_thumbnail_cuda.cu index 98fad4303a..c73e49fbc6 100644 --- a/libavfilter/vf_thumbnail_cuda.cu +++ b/libavfilter/vf_thumbnail_cuda.cu @@ -22,55 +22,54 @@ extern "C" { -texture uchar_tex; -texture uchar2_tex; -texture ushort_tex; -texture ushort2_tex; - -__global__ void Thumbnail_uchar(int *histogram, int src_width, int src_height) +__global__ void Thumbnail_uchar(cudaTextureObject_t uchar_tex, + int *histogram, int src_width, int src_height) { int x = blockIdx.x * blockDim.x + threadIdx.x; int y = blockIdx.y * blockDim.y + threadIdx.y; if (y < src_height && x < src_width) { - unsigned char pixel = tex2D(uchar_tex, x, y); + unsigned char pixel = tex2D(uchar_tex, x, y); atomicAdd(&histogram[pixel], 1); } } -__global__ void Thumbnail_uchar2(int *histogram, int src_width, int src_height) +__global__ void Thumbnail_uchar2(cudaTextureObject_t uchar2_tex, + int *histogram, int src_width, int src_height) { int x = blockIdx.x * blockDim.x + threadIdx.x; int y = blockIdx.y * blockDim.y + threadIdx.y; if (y < src_height && x < src_width) { - uchar2 pixel = tex2D(uchar2_tex, x, y); + uchar2 pixel = tex2D(uchar2_tex, x, y); atomicAdd(&histogram[pixel.x], 1); atomicAdd(&histogram[256 + pixel.y], 1); } } -__global__ void Thumbnail_ushort(int *histogram, int src_width, int src_height) +__global__ void Thumbnail_ushort(cudaTextureObject_t ushort_tex, + int *histogram, int src_width, int src_height) { int x = blockIdx.x * blockDim.x + threadIdx.x; int y = blockIdx.y * blockDim.y + threadIdx.y; if (y < src_height && x 
< src_width) { - unsigned short pixel = (tex2D(ushort_tex, x, y) + 128) >> 8; + unsigned short pixel = (tex2D(ushort_tex, x, y) + 128) >> 8; atomicAdd(&histogram[pixel], 1); } } -__global__ void Thumbnail_ushort2(int *histogram, int src_width, int src_height) +__global__ void Thumbnail_ushort2(cudaTextureObject_t ushort2_tex, + int *histogram, int src_width, int src_height) { int x = blockIdx.x * blockDim.x + threadIdx.x; int y = blockIdx.y * blockDim.y + threadIdx.y; if (y < src_height && x < src_width) { - ushort2 pixel = tex2D(ushort2_tex, x, y); + ushort2 pixel = tex2D(ushort2_tex, x, y); atomicAdd(&histogram[(pixel.x + 128) >> 8], 1); atomicAdd(&histogram[256 + (pixel.y + 128) >> 8], 1); }