# ════════════════════════════════════════════════════════════════════
# filters.nim — kernel-based image filters for CxHxW uint8 tensors
# NOTE(review): this file was reconstructed from a whitespace-mangled
# diff; module imports were not visible in the reviewed chunk.
# ════════════════════════════════════════════════════════════════════

proc kernel*[T, U](img: Tensor[T], kernel: Tensor[U], scale: U = 1,
                   offset: U = 0): Tensor[T] =
  ## Applies a kernel matrix to an image, divides the output by the
  ## `scale` factor and then sums the `offset` to every element.
  ## For more information see
  ## https://en.wikipedia.org/wiki/Kernel_(image_processing)
  ##
  ## Implementation details:
  ## This function does not flip the kernel, so it performs image
  ## correlation instead of convolution. Padding at the image borders
  ## is filled with the nearest border pixel (`PadNearest`).
  assert kernel.width == kernel.height
  # Broadcast the single 2-D kernel across every channel of the image.
  let bcastKernel = kernel.bc([img.channels, kernel.height, kernel.width])
  let pad = (bcastKernel.width - 1) div 2
  var correlated = img.correlate2d(bcastKernel, pad, PadNearest)
  if scale != 1.U:
    correlated /= scale
  if offset != 0.U:
    correlated += offset.bc(correlated.shape)
  result = quantize_bytes(correlated, T)

proc filter_blur*[T](img: Tensor[T]): Tensor[T] =
  ## Blur an image using a predefined kernel
  img.kernel([[
    [1, 1, 1, 1, 1],
    [1, 0, 0, 0, 1],
    [1, 0, 0, 0, 1],
    [1, 0, 0, 0, 1],
    [1, 1, 1, 1, 1]
  ]].toTensor(), 16)

proc filter_contour*[T](img: Tensor[T]): Tensor[T] =
  ## Contour an image using a predefined kernel
  img.kernel([[
    [-1, -1, -1],
    [-1,  8, -1],
    [-1, -1, -1]
  ]].toTensor(), 1, 255)

proc filter_detail*[T](img: Tensor[T]): Tensor[T] =
  ## Detail an image using a predefined kernel
  img.kernel([[
    [ 0, -1,  0],
    [-1, 10, -1],
    [ 0, -1,  0]
  ]].toTensor(), 6)

proc filter_edge_enhance*[T](img: Tensor[T]): Tensor[T] =
  ## Enhance edges of an image using a predefined kernel
  img.kernel([[
    [-1, -1, -1],
    [-1, 10, -1],
    [-1, -1, -1]
  ]].toTensor(), 2)

proc filter_edge_enhance_more*[T](img: Tensor[T]): Tensor[T] =
  ## Enhance edges of an image using a predefined kernel
  img.kernel([[
    [-1, -1, -1],
    [-1,  9, -1],
    [-1, -1, -1]
  ]].toTensor(), 1)

proc filter_emboss*[T](img: Tensor[T]): Tensor[T] =
  ## Emboss an image using a predefined kernel
  img.kernel([[
    [-1, 0, 0],
    [ 0, 1, 0],
    [ 0, 0, 0]
  ]].toTensor(), 1, 128)

proc filter_sharpen*[T](img: Tensor[T]): Tensor[T] =
  ## Sharpen an image using a predefined kernel
  img.kernel([[
    [-2, -2, -2],
    [-2, 32, -2],
    [-2, -2, -2]
  ]].toTensor(), 16)

proc filter_smooth*[T](img: Tensor[T]): Tensor[T] =
  ## Smooth an image using a predefined kernel
  img.kernel([[
    [1, 1, 1],
    [1, 5, 1],
    [1, 1, 1]
  ]].toTensor(), 13)

proc filter_find_edges*[T](img: Tensor[T]): Tensor[T] =
  ## Find edges of an image using a predefined kernel
  img.kernel([[
    [-1, -1, -1],
    [-1,  8, -1],
    [-1, -1, -1]
  ]].toTensor(), 1)

proc filter_smooth_more*[T](img: Tensor[T]): Tensor[T] =
  ## Smooth more an image using a predefined kernel
  img.kernel([[
    [1, 1,  1, 1, 1],
    [1, 5,  5, 5, 1],
    [1, 5, 44, 5, 1],
    [1, 5,  5, 5, 1],
    [1, 1,  1, 1, 1]
  ]].toTensor(), 100)

# ════════════════════════════════════════════════════════════════════
# imageio.nim — loading/saving images via stb_image wrappers
# ════════════════════════════════════════════════════════════════════

proc channels*[T](img: Tensor[T]): int {.inline.} =
  ## Return number of channels of the image
  img.shape[^3]

proc height*[T](img: Tensor[T]): int {.inline.} =
  ## Return height of the image
  img.shape[^2]

proc width*[T](img: Tensor[T]): int {.inline.} =
  ## Return width of the image
  img.shape[^1]

proc hwc2chw*[T](img: Tensor[T]): Tensor[T] =
  ## Convert image from the HxWxC convention to the CxHxW convention,
  ## where C,H,W stand for channels, height, width. Note that this
  ## library only works with CxHxW images for optimization and internal
  ## usage reasons; CxHxW is also a common layout in deep learning.
  img.permute(2, 0, 1)

proc chw2hwc*[T](img: Tensor[T]): Tensor[T] =
  ## Convert image from the CxHxW convention to the HxWxC convention,
  ## where C,H,W stand for channels, height, width. Note that this
  ## library only works with CxHxW images for optimization and internal
  ## usage reasons; CxHxW is also a common layout in deep learning.
  img.permute(1, 2, 0)

proc pixels*(img: Tensor[uint8]): seq[uint8] =
  ## Return contiguous pixel data in the HxWxC convention; intended for
  ## interfacing with other libraries (e.g. the stb writers below).
  img.chw2hwc().asContiguous().data

proc load*(filename: string, desired_channels: int = 0): Tensor[uint8] =
  ## Load image from file, with the desired number of channels, into a
  ## contiguous CxHxW Tensor[uint8]. `desired_channels` defaults to 0,
  ## meaning auto-detect; the returned tensor is CxHxW even for
  ## single-channel images.
  ##
  ## Supports PNG, JPG, BMP, TGA and HDR formats.
  ##
  ## On error an IOError exception will be thrown.
  var width, height, channels: int
  try:
    let pixels = stbi.load(filename, width, height, channels, desired_channels)
    # stb_image reports the file's *native* channel count in `channels`,
    # while the returned buffer has `desired_channels` components when a
    # non-zero value was requested — reshape with the buffer's real
    # layout, not the file's.  NOTE(review): confirm against the stbi
    # wrapper in use; the original reshaped with `channels`, which
    # breaks whenever desired_channels != 0 differs from the file.
    let bufChannels = if desired_channels == 0: channels
                      else: desired_channels
    result = pixels.toTensor.reshape([height, width, bufChannels])
                   .hwc2chw().asContiguous()
  except STBIException:
    raise newException(IOError, getCurrentExceptionMsg())

proc loadFromMemory*(contents: string, desired_channels: int = 0): Tensor[uint8] =
  ## Like `load` but loads from memory; `contents` must be a buffer
  ## holding a supported image format. Throws IOError on error.
  var width, height, channels: int
  try:
    let pixels = stbi.loadFromMemory(
      cast[seq[uint8]](toSeq(contents.items)),
      width, height, channels, desired_channels)
    # Same stb_image channel-count caveat as in `load` above.
    let bufChannels = if desired_channels == 0: channels
                      else: desired_channels
    result = pixels.toTensor.reshape([height, width, bufChannels])
                   .hwc2chw().asContiguous()
  except STBIException:
    # Translate to IOError for consistency with `load`.
    raise newException(IOError, getCurrentExceptionMsg())

proc loadFromDir*(dir: string, desired_channels: int = 0): seq[Tensor[uint8]] =
  ## Load a batch of images from a directory into a seq of tensors.
  ## The load is non-recursive; throws an IOError exception on error.
  if not dirExists(dir):
    raise newException(IOError, "Directory not found: " & dir)

  result = newSeq[Tensor[uint8]]()
  for kind, path in walkDir(dir):
    if kind == pcFile:
      result.add(load(path, desired_channels))

  if result.len == 0:
    raise newException(IOError,
      "No images found for loading in directory: " & dir)

proc save*(img: Tensor[uint8], filename: string, jpeg_quality: int = 100) =
  ## Save an image to a file; supports PNG, BMP, TGA and JPG (chosen by
  ## file extension). `jpeg_quality` sets JPG quality in 0..100,
  ## defaults to 100. Throws IOError on failure.
  var ok = false
  if filename.endsWith(".png"):
    ok = stbiw.writePNG(filename, img.width, img.height, img.channels, img.pixels)
  elif filename.endsWith(".bmp"):
    ok = stbiw.writeBMP(filename, img.width, img.height, img.channels, img.pixels)
  elif filename.endsWith(".tga"):
    ok = stbiw.writeTGA(filename, img.width, img.height, img.channels, img.pixels)
  elif filename.endsWith(".jpg"):
    ok = stbiw.writeJPG(filename, img.width, img.height, img.channels, img.pixels,
                        jpeg_quality)
  else:
    # Previously an unknown extension fell through to the generic
    # failure message below — name the real cause instead.
    raise newException(IOError,
      "Unsupported image extension for file: " & filename &
      " (supported: .png, .bmp, .tga, .jpg)")

  if not ok:
    raise newException(IOError, "Failed to save image to a file: " & filename)

proc toPNG*(img: Tensor[uint8]): seq[byte] =
  ## Encode an image as PNG into a sequence of bytes
  return stbiw.writePNG(img.width, img.height, img.channels, img.pixels)

proc toBMP*(img: Tensor[uint8]): seq[byte] =
  ## Encode an image as BMP into a sequence of bytes
  return stbiw.writeBMP(img.width, img.height, img.channels, img.pixels)

proc toTGA*(img: Tensor[uint8]): seq[byte] =
  ## Encode an image as TGA into a sequence of bytes
  return stbiw.writeTGA(img.width, img.height, img.channels, img.pixels)

proc toJPG*(img: Tensor[uint8], quality: int = 100): seq[byte] =
  ## Encode an image as JPG into a sequence of bytes.
  ## `quality` sets the saving quality in a range 0..100, defaults
  ## to 100.
  return stbiw.writeJPG(img.width, img.height, img.channels, img.pixels, quality)

# ════════════════════════════════════════════════════════════════════
# scale.nim — image resampling
# ════════════════════════════════════════════════════════════════════

type
  ScaleMode* = enum
    ScaleNearest = 0
    ScaleBilinear = 1

proc round_pixel(a: float32, U: typedesc): U {.inline.} =
  ## Round a float sample to the pixel type `U`: clamped round-half-up
  ## for uint8, identity for float32. Other types are rejected at
  ## compile time (no `when` branch matches).
  when U is uint8:
    clamp(a + 0.5.float32, low(U).float32, high(U).float32).uint8
  elif U is float32:
    a.float32

proc scale_nearest[T](src: Tensor[T], width, height: int): Tensor[T] {.inline.} =
  ## Nearest-neighbour resampling of `src` to `width` x `height`.
  ## NOTE(review): the reviewed copy of the diff is truncated inside
  ## this proc; the sampling loop below is the standard
  ## nearest-neighbour reconstruction — diff against the original
  ## commit before building.
  ## NOTE(review): `step_x` is a row (height) ratio and `step_y` a
  ## column (width) ratio — the x/y names look swapped; confirm intent.
  result = newTensor[T]([src.channels, height, width])
  let
    step_x = src.height.float32 / height.float32
    step_y = src.width.float32 / width.float32
  for c in 0 ..< src.channels:
    for y in 0 ..< height:
      for x in 0 ..< width:
        result[c, y, x] = src[c,
          min((y.float32 * step_x).int, src.height - 1),
          min((x.float32 * step_y).int, src.width - 1)]

# NOTE(review): between `scale_nearest` and `correlate2d` the reviewed
# diff is elided: the bilinear scaler, the public `scale` dispatcher,
# the `PadMode` enum (with `PadConstant`/`PadNearest`) and the head of
# the `im2col` helper are missing.  Only this fragment of `im2col`'s
# border handling was visible — restore from the original commit:
#
#   if row < 0 or col < 0 or row >= height or col >= width:
#     case mode:
#     of PadConstant:
#       result[c, offset_col + w] = pad_constant
#     of PadNearest:
#       result[c, offset_col + w] =
#         input[c_offset, clamp(row, 0, height-1), clamp(col, 0, width-1)]
#   else:
#     result[c, offset_col + w] = input[c_offset, row, col]

proc correlate2d*[T, U](input: Tensor[T], weights: Tensor[U], pad: int = 0,
                        mode: PadMode = PadConstant, cval: U = 0): Tensor[int] =
  ## Correlate an image with the given kernel weights; this is a
  ## convolution without flipping the kernel.
  let ksize = weights.width

  assert input.rank == 3
  assert weights.rank == 3
  assert weights.width == weights.height
  assert ksize > 0 and ksize mod 2 == 1

  let
    channels = input.channels
    height = input.height + (2 * pad) - ksize + 1
    width = input.width + (2 * pad) - ksize + 1
    channel_ksize = ksize * ksize

  # One (1 x k²) weight row per channel, against the (k² x H·W) patch
  # matrix built by im2col — a per-channel matrix product.
  var w = weights.reshape([channels, 1, ksize * ksize])
  var x = im2col(input.astype(U), ksize, pad, mode, cval)
           .reshape([channels, channel_ksize, height * width])
  var res_channels = newSeq[Tensor[U]](channels)

  # NOTE(review): the tail of this proc is truncated in the reviewed
  # diff; the loop below is the obvious completion (per-channel
  # matmul, stack, reshape to CxHxW) — verify against the original.
  for c in 0 ..< channels:
    res_channels[c] = (w[c, _, _].squeeze(0) * x[c, _, _].squeeze(0))
  result = res_channels.stack()
            .reshape([channels, height, width]).astype(int)

# ════════════════════════════════════════════════════════════════════
# visdom.nim — minimal visdom client
# ════════════════════════════════════════════════════════════════════

proc sendEvent(self: VisdomClient, opts, data: JsonNode, window: string) =
  ## POST an event payload to the visdom server's /events endpoint.
  ## NOTE(review): the head of this proc (construction of `params` and
  ## the exact signature) is truncated in the reviewed diff;
  ## reconstructed minimally from the visible tail and the call in
  ## `image` — verify payload keys against the original.
  var params = %*{"data": data, "opts": opts}  # TODO confirm keys
  if window.len > 0:
    params["win"] = % window

  let url = "http://" & self.host & ":" & $self.port & "/events"
  postJson(url, params)

proc image*(vis: VisdomClient,
            img: Tensor[uint8],
            window: string = "",
            caption: string = "",
            title: string = "") =
  ## Show an image in visdom with the given title in the specified
  ## window; falls back to the window name as title when `title` is
  ## empty.
  let opts = %*{
    "title": if title.len > 0: title else: window,
    "height": img.height,
    "width": img.width
  }

  let data = %*[{
    "content": {
      # "image/jpeg" is the registered MIME type; the original used the
      # nonstandard "image/jpg".  NOTE(review): confirm the visdom
      # server accepts both before relying on this.
      "src": img.toJPG().webEncodeData("image/jpeg"),
      "caption": caption
    },
    "type": "image"
  }]

  vis.sendEvent(opts, data, window)