Convolutional Neural Networks

Conv filters

cfg = OmegaConf.load('../config/data/image/mnist.yaml')
dm = instantiate(cfg)
dm.prepare_data()
dm.setup()
[14:49:25] INFO - Init ImageDataModule for mnist
[14:49:29] INFO - loading dataset mnist with args () from split train
[14:49:29] INFO - loading dataset mnist from split train
x, y = dm.train_ds[0]
print(x.shape)
plt.figure(figsize=(3,3))
plt.imshow(x.squeeze(), cmap='gray')
torch.Size([1, 28, 28])

top_kernel = torch.tensor( # torch.tensor infers datatype vs. torch.Tensor
    [[-1., -1., -1.],
     [0., 0., 0.],
     [1., 1., 1.]]
)

bottom_kernel = torch.tensor( # torch.tensor infers datatype vs. torch.Tensor
    [[1., 1., 1.],
     [0., 0., 0.],
     [-1., -1., -1.]]
)

left_kernel = torch.tensor( # torch.tensor infers datatype vs. torch.Tensor
    [[-1., 0., 1.],
     [-1., 0., 1.],
     [-1., 0., 1.]]
)
my_kernel = left_kernel

c = nn.Conv2d(1, 1, kernel_size=3, padding=1, bias=False)
with torch.no_grad():
    c.weight.copy_(my_kernel)

print(x.shape)
y = c(x)
print(y.shape)
plt.figure(figsize=(3,3))
plt.imshow(y.squeeze().detach(), cmap='gray')
plt.title('Convolution')

dc = nn.ConvTranspose2d(1, 1, kernel_size=3, padding=1, bias=False)
# with torch.no_grad():
#     dc.weight.copy_(my_kernel)

x_bar = dc(y)
print(x_bar.shape)
plt.figure(figsize=(3,3))
plt.imshow(x_bar.squeeze().detach(), cmap='gray')
plt.title('Convolution transpose')
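Since the hand-crafted kernels are just fixed weights, they can also be applied directly with torch.nn.functional.conv2d on a synthetic image, independently of the datamodule. A minimal sketch (the synthetic image and printed values are illustrative):

import torch
import torch.nn.functional as F

# synthetic image: dark top half, bright bottom half -> one strong horizontal edge
img = torch.zeros(1, 1, 28, 28)
img[:, :, 14:, :] = 1.0

top_kernel = torch.tensor([[-1., -1., -1.],
                           [ 0.,  0.,  0.],
                           [ 1.,  1.,  1.]])

# F.conv2d expects weights of shape (out_channels, in_channels, kH, kW)
out = F.conv2d(img, top_kernel.view(1, 1, 3, 3), padding=1)
print(out.shape)  # torch.Size([1, 1, 28, 28])
print(out.max())  # strongest positive response sits along the dark-to-bright edge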

Conv Block

Using a convolution with a stride of 2 instead of max pooling achieves the same goal of downsampling an image by halving its spatial dimensions. The key difference is that the strided convolution learns how to combine features from overlapping regions, whereas max pooling simply keeps the maximum value in each window and discards the finer detail around it. Strided convolutions are therefore often preferred when preserving spatial information matters; the sketch below compares the two on the same input.
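A minimal sketch in plain PyTorch, comparing output shapes and parameter counts:

import torch
import torch.nn as nn

x = torch.rand(1, 1, 28, 28)

# learned downsampling: stride-2 convolution (can also change the channel count)
conv_down = nn.Conv2d(1, 1, kernel_size=3, stride=2, padding=1)
# fixed downsampling: 2x2 max pooling, no learnable parameters
pool_down = nn.MaxPool2d(kernel_size=2)

print(conv_down(x).shape)  # torch.Size([1, 1, 14, 14])
print(pool_down(x).shape)  # torch.Size([1, 1, 14, 14])
print(sum(p.numel() for p in conv_down.parameters()))  # 10 (3x3 kernel + bias)
print(sum(p.numel() for p in pool_down.parameters()))  # 0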


source

ConvBlock

 ConvBlock (in_channels:int=3, out_channels:int=16, kernel_size:int=3,
            stride:int=2, bias:bool=True,
            normalization:Optional[Type[torch.nn.modules.module.Module]]=<class 'torch.nn.modules.batchnorm.BatchNorm2d'>,
            activation:Optional[Type[torch.nn.modules.module.Module]]=<class 'torch.nn.modules.activation.ReLU'>)

*Base class for all neural network modules.

Your models should also subclass this class.

Modules can also contain other Modules, allowing to nest them in a tree structure. You can assign the submodules as regular attributes::

import torch.nn as nn
import torch.nn.functional as F

class Model(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(1, 20, 5)
        self.conv2 = nn.Conv2d(20, 20, 5)

    def forward(self, x):
        x = F.relu(self.conv1(x))
        return F.relu(self.conv2(x))

Submodules assigned in this way will be registered, and will have their parameters converted too when you call :meth:to, etc.

.. note:: As per the example above, an __init__() call to the parent class must be made before assignment on the child.

ivar training: Boolean represents whether this module is in training or evaluation mode. :vartype training: bool*
                Type       Default       Details
in_channels     int        3             input channels
out_channels    int        16            output channels
kernel_size     int        3             kernel size
stride          int        2             stride
bias            bool       True          bias is False if BatchNorm
normalization   Optional   BatchNorm2d   normalization
activation      Optional   ReLU          activation

Usage

B, C, H, W = 64, 1, 28, 28
X = torch.rand(B, C, H,W)
# stride 2 layer downsample to (W/2, H/2)
model = ConvBlock(
    in_channels=C,
    out_channels=16,
    kernel_size=3,
    stride=2,
    bias=True,
    normalization=nn.BatchNorm2d,
    )

# grab the first layer of the sequential block (the Conv2d) and init its weights
with torch.no_grad():
    model.net[0].weight.copy_(top_kernel)

print("Y: ", model(X).shape)
# flatten all dims except the batch dim
Y = torch.flatten(model(X), 1)
print(Y.shape)
summary(model, input_size=(B, C, H, W), depth=2)
[13:58:07] WARNING - setting conv bias back to False as Batchnorm is used
Y:  torch.Size([64, 16, 14, 14])
torch.Size([64, 3136])
==========================================================================================
Layer (type:depth-idx)                   Output Shape              Param #
==========================================================================================
ConvBlock                                [64, 16, 14, 14]          --
├─Sequential: 1-1                        [64, 16, 14, 14]          --
│    └─Conv2d: 2-1                       [64, 16, 14, 14]          144
│    └─BatchNorm2d: 2-2                  [64, 16, 14, 14]          32
│    └─ReLU: 2-3                         [64, 16, 14, 14]          --
==========================================================================================
Total params: 176
Trainable params: 176
Non-trainable params: 0
Total mult-adds (Units.MEGABYTES): 1.81
==========================================================================================
Input size (MB): 0.20
Forward/backward pass size (MB): 3.21
Params size (MB): 0.00
Estimated Total Size (MB): 3.41
==========================================================================================
nn.Sequential(
    ConvBlock(1, 8),
    ConvBlock(8, 16),
    ConvBlock(16, 32),
    ConvBlock(32, 16)
    )(X).shape
[13:58:09] WARNING - setting conv bias back to False as Batchnorm is used
[13:58:09] WARNING - setting conv bias back to False as Batchnorm is used
[13:58:09] WARNING - setting conv bias back to False as Batchnorm is used
[13:58:09] WARNING - setting conv bias back to False as Batchnorm is used
torch.Size([64, 16, 2, 2])

Configs

cfg = OmegaConf.load('../config/model/image/convblock.yaml')
net = instantiate(cfg.defaults)
B, C, H, W = 64, 1, 28, 28
X = torch.rand(B, C, H,W)
print(summary(net))
print("Y: ",net(X).shape)
Seed set to 42
[14:49:51] WARNING - setting conv bias back to False as Batchnorm is used
=================================================================
Layer (type:depth-idx)                   Param #
=================================================================
ConvBlock                                --
├─Sequential: 1-1                        --
│    └─Conv2d: 2-1                       144
│    └─BatchNorm2d: 2-2                  32
│    └─ReLU: 2-3                         --
=================================================================
Total params: 176
Trainable params: 176
Non-trainable params: 0
=================================================================
Y:  torch.Size([64, 16, 14, 14])
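The YAML itself is not reproduced here; for orientation, here is an inline OmegaConf sketch of what the defaults entry plausibly looks like, mirroring the hydra.utils.get_class pattern of the ConvNet config printed further down (the _target_ path and all values are assumptions, not the actual file contents):

from omegaconf import OmegaConf
from hydra.utils import instantiate

# hypothetical inline equivalent of config/model/image/convblock.yaml (structure assumed)
cfg = OmegaConf.create({
    "defaults": {
        "_target_": "nimrod.models.conv.ConvBlock",  # assumed module path
        "in_channels": 1,
        "out_channels": 16,
        "kernel_size": 3,
        "stride": 2,
        "bias": True,
        "normalization": {"_target_": "hydra.utils.get_class", "path": "torch.nn.BatchNorm2d"},
        "activation": {"_target_": "hydra.utils.get_class", "path": "torch.nn.ReLU"},
    }
})
net = instantiate(cfg.defaults)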

Pre-Activation Conv Block


source

PreActivationConvBlock

 PreActivationConvBlock (in_channels:int=3, out_channels:int=16,
                         kernel_size:int=3, stride:int=2, bias:bool=True,
                         normalization:Optional[Type[torch.nn.modules.module.Module]]=<class 'torch.nn.modules.batchnorm.BatchNorm2d'>,
                         activation:Optional[Type[torch.nn.modules.module.Module]]=<class 'torch.nn.modules.activation.ReLU'>)

*Base class for all neural network modules.

Your models should also subclass this class.

Modules can also contain other Modules, allowing to nest them in a tree structure. You can assign the submodules as regular attributes::

import torch.nn as nn
import torch.nn.functional as F

class Model(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(1, 20, 5)
        self.conv2 = nn.Conv2d(20, 20, 5)

    def forward(self, x):
        x = F.relu(self.conv1(x))
        return F.relu(self.conv2(x))

Submodules assigned in this way will be registered, and will have their parameters converted too when you call :meth:to, etc.

.. note:: As per the example above, an __init__() call to the parent class must be made before assignment on the child.

ivar training: Boolean represents whether this module is in training or evaluation mode. :vartype training: bool*
                Type       Default       Details
in_channels     int        3             input channels
out_channels    int        16            output channels
kernel_size     int        3             kernel size
stride          int        2             stride
bias            bool       True
normalization   Optional   BatchNorm2d
activation      Optional   ReLU
B, C, H, W = 64, 1, 28, 28
X = torch.rand(B, C, H,W)
# stride 2 layer downsample to (W/2, H/2)
model = PreActivationConvBlock(
    in_channels=C,
    out_channels=16,
    kernel_size=3,
    stride=2,
    bias=True,
    normalization=nn.BatchNorm2d,
    )

# grab the last layer of the sequential block (the Conv2d comes last in a pre-activation block) and init its weights
with torch.no_grad():
    model.net[-1].weight.copy_(top_kernel)

print("Y: ", model(X).shape)
# flatten all dims except the batch dim
Y = torch.flatten(model(X), 1)
print(Y.shape)
summary(model, input_size=(B, C, H, W), depth=2)
[13:58:16] WARNING - setting conv bias back to False as Batchnorm is used
Y:  torch.Size([64, 16, 14, 14])
torch.Size([64, 3136])
==========================================================================================
Layer (type:depth-idx)                   Output Shape              Param #
==========================================================================================
PreActivationConvBlock                   [64, 16, 14, 14]          --
├─Sequential: 1-1                        [64, 16, 14, 14]          --
│    └─BatchNorm2d: 2-1                  [64, 1, 28, 28]           2
│    └─ReLU: 2-2                         [64, 1, 28, 28]           --
│    └─Conv2d: 2-3                       [64, 16, 14, 14]          144
==========================================================================================
Total params: 146
Trainable params: 146
Non-trainable params: 0
Total mult-adds (Units.MEGABYTES): 1.81
==========================================================================================
Input size (MB): 0.20
Forward/backward pass size (MB): 2.01
Params size (MB): 0.00
Estimated Total Size (MB): 2.21
==========================================================================================
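As the summary shows, the only difference from ConvBlock is the ordering: normalization and activation run before the convolution (the pre-activation ordering from ResNet v2). A minimal sketch of the two orderings in plain PyTorch, for comparison:

import torch.nn as nn

# post-activation (ConvBlock-style): Conv -> BatchNorm -> ReLU
post_act = nn.Sequential(
    nn.Conv2d(1, 16, kernel_size=3, stride=2, padding=1, bias=False),
    nn.BatchNorm2d(16),
    nn.ReLU(),
)

# pre-activation (PreActivationConvBlock-style): BatchNorm -> ReLU -> Conv
# note: BatchNorm over the single input channel has only 2 parameters, matching the summary above
pre_act = nn.Sequential(
    nn.BatchNorm2d(1),
    nn.ReLU(),
    nn.Conv2d(1, 16, kernel_size=3, stride=2, padding=1, bias=False),
)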

Deconv Block


source

DeconvBlock

 DeconvBlock (in_channels:int=16, out_channels:int=3, kernel_size:int=3,
              bias:bool=True,
              normalization:Optional[Type[torch.nn.modules.module.Module]]=None,
              activation:Optional[Type[torch.nn.modules.module.Module]]=<class 'torch.nn.modules.activation.ReLU'>,
              scale_factor:int=2, use_transposed_conv:bool=False)

*Base class for all neural network modules.

Your models should also subclass this class.

Modules can also contain other Modules, allowing to nest them in a tree structure. You can assign the submodules as regular attributes::

import torch.nn as nn
import torch.nn.functional as F

class Model(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(1, 20, 5)
        self.conv2 = nn.Conv2d(20, 20, 5)

    def forward(self, x):
        x = F.relu(self.conv1(x))
        return F.relu(self.conv2(x))

Submodules assigned in this way will be registered, and will have their parameters converted too when you call :meth:to, etc.

.. note:: As per the example above, an __init__() call to the parent class must be made before assignment on the child.

ivar training: Boolean represents whether this module is in training or evaluation mode. :vartype training: bool*
                      Type       Default   Details
in_channels           int        16        input channels
out_channels          int        3         output channels
kernel_size           int        3         kernel size
bias                  bool       True
normalization         Optional   None
activation            Optional   ReLU
scale_factor          int        2
use_transposed_conv   bool       False

Usage

B, C, H, W = 64, 3, 28, 28
X = torch.rand(B, C, H, W)
deconv = DeconvBlock(3, 8, scale_factor=2, kernel_size=3, use_transposed_conv=True)
print(deconv)
print("Y: ",deconv(X).shape)
DeconvBlock(
  (_net): Sequential(
    (0): ConvTranspose2d(3, 8, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), output_padding=(1, 1))
    (1): ReLU()
  )
)
Y:  torch.Size([64, 8, 56, 56])
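The use_transposed_conv flag switches between a learned transposed convolution and fixed upsampling followed by a regular convolution; both produce the same output shape. A minimal sketch of the two paths in plain PyTorch (hyperparameters mirror the printed module above):

import torch
import torch.nn as nn

x = torch.rand(1, 3, 28, 28)

# option 1: transposed convolution (learned upsampling)
tconv = nn.ConvTranspose2d(3, 8, kernel_size=3, stride=2, padding=1, output_padding=1)

# option 2: fixed nearest-neighbor upsampling followed by a regular convolution
up_conv = nn.Sequential(
    nn.Upsample(scale_factor=2, mode="nearest"),
    nn.Conv2d(3, 8, kernel_size=3, padding=1),
)

print(tconv(x).shape)    # torch.Size([1, 8, 56, 56])
print(up_conv(x).shape)  # torch.Size([1, 8, 56, 56])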

Conv-Deconv

# one image
x, y = dm.train_ds[0]
C, H, W = x.shape
# make fake batch dimension
x = x.unsqueeze(0)
print("x:", x.shape)
plt.figure(figsize=(3,3))
plt.imshow(x.squeeze(), cmap='gray')
plt.title("original image")

my_kernel = left_kernel

c = ConvBlock(1,3, kernel_size=3, stride=1)
with torch.no_grad():
    c.net[0].weight.copy_(my_kernel) # set kernel weights for convlayer 0 (actual convolution2d)

y = c(x)
print("y: ", y.shape)

plt.figure(figsize=(3,3))
plt.imshow(y.detach().squeeze().numpy().transpose(1, 2, 0), cmap='gray')
plt.title("filtered image")


dc = DeconvBlock(3, 1, scale_factor=2, kernel_size=3)
with torch.no_grad():
    dc._net[1].weight.copy_(my_kernel) # set kernel weights for convlayer 1 (actual convolution2d)
x_bar = dc(y)
print("x_bar: ", x_bar.shape)
plt.figure(figsize=(3,3))
plt.imshow(x_bar.detach().squeeze(), cmap='gray')
plt.title("Deconv image")

ConvNet

Simple convolution network for image recognition


source

ConvNet

 ConvNet (n_features:List[int]=[1, 8, 16, 32, 64, 128],
          num_classes:int=10, kernel_size:int=3, bias:bool=False,
          normalization:torch.nn.modules.module.Module=<class
          'torch.nn.modules.batchnorm.BatchNorm2d'>,
          activation:torch.nn.modules.module.Module=<class
          'torch.nn.modules.activation.ReLU'>)

*Base class for all neural network modules.

Your models should also subclass this class.

Modules can also contain other Modules, allowing to nest them in a tree structure. You can assign the submodules as regular attributes::

import torch.nn as nn
import torch.nn.functional as F

class Model(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(1, 20, 5)
        self.conv2 = nn.Conv2d(20, 20, 5)

    def forward(self, x):
        x = F.relu(self.conv1(x))
        return F.relu(self.conv2(x))

Submodules assigned in this way will be registered, and will have their parameters converted too when you call :meth:to, etc.

.. note:: As per the example above, an __init__() call to the parent class must be made before assignment on the child.

ivar training: Boolean represents whether this module is in training or evaluation mode. :vartype training: bool*
                Type     Default                   Details
n_features      List     [1, 8, 16, 32, 64, 128]   channel/feature expansion
num_classes     int      10                        num_classes
kernel_size     int      3                         kernel size
bias            bool     False                     conv2d bias
normalization   Module   BatchNorm2d               normalization (before activation)
activation      Module   ReLU                      activation function

Usage

# data
B, C, H, W = 64, 3, 64, 64
X = torch.rand(B, C, H, W)
X.shape

# n_features = [3, 8, 16, 32, 64, 128]  # spatial size progression for 28x28 inputs: 28, 14, 7, 4, 2, 1
n_features = [3, 8, 16, 32, 64, 128, 64]  # spatial size progression for 64x64 inputs: 64, 32, 16, 8, 4, 2, 1
num_classes = 20

convnet = ConvNet(
    n_features=n_features, # channel/feature expansion
    num_classes=num_classes, # num_classes
    kernel_size=3, # kernel size
    bias=False, # conv2d bias
    normalization=nn.BatchNorm2d, # normalization (before activation)
    activation=nn.ReLU,
)
out = convnet(X)
print(out.shape)
print(summary(convnet, input_size=(X.shape), depth=2))
torch.Size([64, 20])
==========================================================================================
Layer (type:depth-idx)                   Output Shape              Param #
==========================================================================================
ConvNet                                  [64, 20]                  --
├─Sequential: 1-1                        [64, 20]                  --
│    └─ConvBlock: 2-1                    [64, 8, 64, 64]           232
│    └─ConvBlock: 2-2                    [64, 16, 32, 32]          1,184
│    └─ConvBlock: 2-3                    [64, 32, 16, 16]          4,672
│    └─ConvBlock: 2-4                    [64, 64, 8, 8]            18,560
│    └─ConvBlock: 2-5                    [64, 128, 4, 4]           73,984
│    └─ConvBlock: 2-6                    [64, 64, 2, 2]            73,856
│    └─ConvBlock: 2-7                    [64, 20, 1, 1]            11,560
│    └─Flatten: 2-8                      [64, 20]                  --
==========================================================================================
Total params: 184,048
Trainable params: 184,048
Non-trainable params: 0
Total mult-adds (Units.MEGABYTES): 378.27
==========================================================================================
Input size (MB): 3.15
Forward/backward pass size (MB): 65.29
Params size (MB): 0.74
Estimated Total Size (MB): 69.18
==========================================================================================
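Each stride-2 block roughly halves the spatial resolution, which is how the 64x64 input ends up at 1x1 before flattening (the first block keeps the resolution, as the summary above shows). A quick check of the size progression, assuming padding=1:

# each stride-2 block computes out = (H + 2*padding - kernel_size) // stride + 1
# with kernel_size=3, stride=2 and (assumed) padding=1, that is roughly H/2
size, sizes = 64, [64]
for _ in range(6):  # the six downsampling blocks visible in the summary above
    size = (size + 2 * 1 - 3) // 2 + 1
    sizes.append(size)
print(sizes)  # [64, 32, 16, 8, 4, 2, 1]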
# from config
cfg = OmegaConf.load('../config/model/image/convnet.yaml')
# print(cfg.defaults)
# convnet = instantiate(cfg.defaults)
print(cfg.batchnorm)
convnet = instantiate(cfg.baseline)

# print(convnet(X).shape)
{'_target_': 'nimrod.models.conv.ConvNet', 'n_features': [1, 8, 16, 32, 64, 128], 'num_classes': 10, 'kernel_size': 3, 'bias': False, 'normalization': {'_target_': 'hydra.utils.get_class', 'path': 'torch.nn.BatchNorm2d'}, 'activation': {'_target_': 'hydra.utils.get_class', 'path': 'torch.nn.ReLU'}}

Training

Dataloaders

# data module config
cfg = OmegaConf.load('../config/data/image/fashion_mnist.yaml')

BATCH_SIZE = 512
datamodule = instantiate(cfg, batch_size=BATCH_SIZE)
datamodule.prepare_data()
datamodule.setup()

# one data point 
X,y = datamodule.test_ds[0]
print("X (C,H,W): ", X.shape, "y: ", y)

# a batch of data via dataloader
XX,YY = next(iter(datamodule.test_dataloader()))
print("XX (B,C,H,W): ", XX.shape, "YY: ", YY.shape)

print(len(datamodule.train_ds))
print(len(datamodule.train_ds) // BATCH_SIZE)
[16:28:58] INFO - Init ImageDataModule for fashion_mnist
[16:29:17] INFO - split train into train/val [0.8, 0.2]
[16:29:17] INFO - train: 48000 val: 12000, test: 10000
X (C,H,W):  torch.Size([1, 32, 32]) y:  9
XX (B,C,H,W):  torch.Size([512, 1, 32, 32]) YY:  torch.Size([512])
48000
93

Model & hardware

device = get_device()
print(device)
cfg = OmegaConf.load('../config/model/image/convnet.yaml')
# print(cfg.defaults)
# convnet = instantiate(cfg.defaults)
print(cfg.baseline)
convnet = instantiate(cfg.baseline)
model = convnet.to(device)

B, C, H, W = 64, 1, 28, 28  # matches the summary below
summary(model, input_size=(B, C, H, W), depth=4)
[16:29:17] INFO - Using device: mps
mps
{'_target_': 'nimrod.models.conv.ConvNet', 'n_features': [1, 8, 16, 32, 64], 'num_classes': 10, 'kernel_size': 3, 'bias': True, 'normalization': None, 'activation': {'_target_': 'hydra.utils.get_class', 'path': 'torch.nn.ReLU'}}
==========================================================================================
Layer (type:depth-idx)                   Output Shape              Param #
==========================================================================================
ConvNet                                  [64, 40]                  --
├─Sequential: 1-1                        [64, 40]                  --
│    └─ConvLayer: 2-1                    [64, 8, 28, 28]           --
│    │    └─Sequential: 3-1              [64, 8, 28, 28]           --
│    │    │    └─Conv2d: 4-1             [64, 8, 28, 28]           80
│    │    │    └─ReLU: 4-2               [64, 8, 28, 28]           --
│    └─ConvLayer: 2-2                    [64, 16, 14, 14]          --
│    │    └─Sequential: 3-2              [64, 16, 14, 14]          --
│    │    │    └─Conv2d: 4-3             [64, 16, 14, 14]          1,168
│    │    │    └─ReLU: 4-4               [64, 16, 14, 14]          --
│    └─ConvLayer: 2-3                    [64, 32, 7, 7]            --
│    │    └─Sequential: 3-3              [64, 32, 7, 7]            --
│    │    │    └─Conv2d: 4-5             [64, 32, 7, 7]            4,640
│    │    │    └─ReLU: 4-6               [64, 32, 7, 7]            --
│    └─ConvLayer: 2-4                    [64, 64, 4, 4]            --
│    │    └─Sequential: 3-4              [64, 64, 4, 4]            --
│    │    │    └─Conv2d: 4-7             [64, 64, 4, 4]            18,496
│    │    │    └─ReLU: 4-8               [64, 64, 4, 4]            --
│    └─ConvLayer: 2-5                    [64, 10, 2, 2]            --
│    │    └─Sequential: 3-5              [64, 10, 2, 2]            --
│    │    │    └─Conv2d: 4-9             [64, 10, 2, 2]            5,770
│    └─Flatten: 2-6                      [64, 40]                  --
==========================================================================================
Total params: 30,154
Trainable params: 30,154
Non-trainable params: 0
Total mult-adds (Units.MEGABYTES): 53.63
==========================================================================================
Input size (MB): 0.20
Forward/backward pass size (MB): 6.16
Params size (MB): 0.12
Estimated Total Size (MB): 6.49
==========================================================================================

LR finder

cfg = OmegaConf.load('../config/model/image/convnet.yaml')
model = instantiate(cfg.batchnorm)
print(summary(model, depth=4))


criterion = nn.CrossEntropyLoss()    
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4) #, weight_decay=1e-5)
    
# Initialize LR Finder
lr_finder = LRFinder(model, optimizer, criterion, device=device)
    
# Run LR range test
lr_finder.range_test(
    datamodule.train_dataloader(),
    start_lr=1e-5,      # Extremely small starting learning rate
    end_lr=10,          # Large ending learning rate
    num_iter=100,   # Number of iterations to test
    smooth_f=0.05,   # Smoothing factor for the loss
    diverge_th=5, 
)
    
# Plot the learning rate vs loss
_, lr_found = lr_finder.plot(log_lr=True)
print('Suggested lr:', lr_found)
    
lr_finder.reset()
=================================================================
Layer (type:depth-idx)                   Param #
=================================================================
ConvNet                                  --
├─Sequential: 1-1                        --
│    └─ConvLayer: 2-1                    --
│    │    └─Sequential: 3-1              --
│    │    │    └─Conv2d: 4-1             72
│    │    │    └─BatchNorm2d: 4-2        16
│    │    │    └─ReLU: 4-3               --
│    └─ConvLayer: 2-2                    --
│    │    └─Sequential: 3-2              --
│    │    │    └─Conv2d: 4-4             1,152
│    │    │    └─BatchNorm2d: 4-5        32
│    │    │    └─ReLU: 4-6               --
│    └─ConvLayer: 2-3                    --
│    │    └─Sequential: 3-3              --
│    │    │    └─Conv2d: 4-7             4,608
│    │    │    └─BatchNorm2d: 4-8        64
│    │    │    └─ReLU: 4-9               --
│    └─ConvLayer: 2-4                    --
│    │    └─Sequential: 3-4              --
│    │    │    └─Conv2d: 4-10            18,432
│    │    │    └─BatchNorm2d: 4-11       128
│    │    │    └─ReLU: 4-12              --
│    └─ConvLayer: 2-5                    --
│    │    └─Sequential: 3-5              --
│    │    │    └─Conv2d: 4-13            73,728
│    │    │    └─BatchNorm2d: 4-14       256
│    │    │    └─ReLU: 4-15              --
│    └─ConvLayer: 2-6                    --
│    │    └─Sequential: 3-6              --
│    │    │    └─Conv2d: 4-16            11,530
│    └─Flatten: 2-7                      --
=================================================================
Total params: 110,018
Trainable params: 110,018
Non-trainable params: 0
=================================================================
Stopping early, the loss has diverged
Learning rate search finished. See the graph with {finder_name}.plot()
LR suggestion: steepest gradient
Suggested LR: 2.01E-03

Suggested lr: 0.0020092330025650463
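The "steepest gradient" suggestion above comes from the core idea of the range test: increase the learning rate exponentially over a fixed number of steps and watch the loss. A minimal, self-contained sketch on a toy regression problem (not using torch-lr-finder; all names and values are illustrative):

import torch
import torch.nn as nn

torch.manual_seed(0)
X_toy, y_toy = torch.randn(512, 10), torch.randn(512, 1)
toy_model = nn.Linear(10, 1)
toy_criterion = nn.MSELoss()

start_lr, end_lr, num_iter = 1e-5, 10.0, 100
gamma = (end_lr / start_lr) ** (1 / num_iter)  # multiplicative LR increase per step
toy_opt = torch.optim.Adam(toy_model.parameters(), lr=start_lr)
toy_sched = torch.optim.lr_scheduler.ExponentialLR(toy_opt, gamma=gamma)

losses, lrs = [], []
for _ in range(num_iter):
    toy_opt.zero_grad()
    loss = toy_criterion(toy_model(X_toy), y_toy)
    loss.backward()
    toy_opt.step()
    toy_sched.step()
    losses.append(loss.item())
    lrs.append(toy_opt.param_groups[0]["lr"])

# common heuristic: pick the LR where the loss decreases fastest (steepest negative slope)
diffs = torch.tensor(losses[1:]) - torch.tensor(losses[:-1])
print("suggested lr ~", lrs[int(diffs.argmin())])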

1-cycle warm-up

device = get_device()
# data module config
cfg_dm = OmegaConf.load('../config/data/image/fashion_mnist.yaml')
cfg_dm.batch_size = 512
datamodule = instantiate(cfg_dm)
datamodule.prepare_data()
datamodule.setup()

# device = 'cpu'
print(device)
cfg_mdl = OmegaConf.load('../config/model/image/convnet.yaml')
convnet = instantiate(cfg_mdl.batchnorm)
model = convnet.to(device)

N_EPOCHS = 5

lr_found = 3e-4

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
steps_per_epoch = len(datamodule.train_ds) // cfg_dm.batch_size
total_steps = steps_per_epoch * N_EPOCHS
print(f"size training set: {len(datamodule.train_ds)}, bs: {cfg_dm.batch_size}, steps/epoch: {steps_per_epoch}, total steps: {total_steps}")
# scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer, max_lr=0.01, steps_per_epoch=steps_per_epochs, epochs=1)

scheduler = torch.optim.lr_scheduler.OneCycleLR(
        optimizer,
        max_lr=lr_found,  # Peak learning rate
        # total_steps=len(datamodule.train_ds) * N_EPOCHS,  # Total training iterations
        steps_per_epoch=steps_per_epoch,
        epochs=N_EPOCHS,
        pct_start=0.3,  # 30% of training increasing LR, 70% decreasing
        anneal_strategy='cos',  # Cosine annealing
        div_factor=10,  # Initial lr = max_lr / div_factor
        # final_div_factor=1e4,
        three_phase=False  # Two phase LR schedule (increase then decrease)
    )

################################


lrs = []
current_step = 0
train_loss_history = []
eval_loss_history = []
avg_train_loss_hist = []
avg_eval_loss_hist = []
max_acc = 0

for epoch in range(N_EPOCHS):
    i = 0
    model.train()
    for images, labels in datamodule.train_dataloader():
        if current_step >= total_steps:
            print(f"Reached total steps: {current_step}/{total_steps}")
            break
        optimizer.zero_grad()
        images, labels = images.to(device), labels.to(device)
        outputs = model(images)
        loss = criterion(outputs, labels)        
        loss.backward()
        optimizer.step()
        scheduler.step()    
        current_step += 1
        train_loss_history.append(loss.item())
        # current_lr = scheduler.get_last_lr()[0]
        current_lr = optimizer.param_groups[0]['lr']
        lrs.append(current_lr)
        if not (i % 100):
            print(f"Loss {loss.item():.4f}, Current LR: {current_lr:.10f}, Step: {current_step}/{total_steps}")
        i += 1

    model.eval()
    with torch.no_grad():
        correct = 0
        total = 0
        for images, labels in datamodule.val_dataloader():
            # model expects input (B,H*W)
            images = images.to(device)
            labels = labels.to(device)
            # Pass the input through the model
            outputs = model(images)
            # eval loss
            eval_loss = criterion(outputs, labels)
            eval_loss_history.append(eval_loss.item())
            # Get the predicted labels
            _, predicted = torch.max(outputs.data, 1)

            # Update the total and correct counts
            total += labels.size(0)
            correct += (predicted == labels).sum()
            acc = 100 * correct / total
            if acc > max_acc:
                max_acc = acc

        # Print the accuracy
    print(f"Epoch {epoch + 1}: Last training Loss {loss.item():.4f}, Last Eval loss {eval_loss.item():.4f} Accuracy = {100 * correct / total:.2f}% Best Accuracy: {max_acc:.2f}")
    # print(f'Current LR: {optimizer.param_groups[0]["lr"]:.5f}')

###################
plt.figure(1)
plt.subplot(211)
plt.ylabel('loss')
plt.xlabel('step')
plt.plot(train_loss_history)
plt.plot(eval_loss_history)
plt.subplot(212)
plt.ylabel('lr')
plt.xlabel('step')
plt.plot(lrs)
Seed set to 42
Seed set to 42
[23:31:47] INFO - Init ImageDataModule for fashion_mnist
[23:31:52] INFO - loading dataset fashion_mnist with args () from split train
[23:32:00] INFO - loading dataset fashion_mnist with args () from split test
[23:32:03] INFO - split train into train/val [0.8, 0.2]
[23:32:03] INFO - train: 48000 val: 12000, test: 10000
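The shape of the 1-cycle schedule itself can also be inspected without any data by stepping the scheduler on a dummy optimizer. A minimal sketch (settings mirror the loop above; values are illustrative):

import torch
import torch.nn as nn

dummy = nn.Linear(1, 1)
opt = torch.optim.Adam(dummy.parameters(), lr=1e-4)
sched = torch.optim.lr_scheduler.OneCycleLR(
    opt, max_lr=3e-4, steps_per_epoch=93, epochs=5,
    pct_start=0.3, anneal_strategy='cos', div_factor=10, three_phase=False,
)

lrs = []
for _ in range(93 * 5):
    opt.step()      # optimizer.step() before scheduler.step(), as in the loop above
    sched.step()
    lrs.append(opt.param_groups[0]['lr'])

# starts near max_lr / div_factor, peaks at max_lr, then anneals to a tiny final LR
print(lrs[0], max(lrs), lrs[-1])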

ConvNetX


source

ConvNetX

 ConvNetX (nnet:__main__.ConvNet, num_classes:int,
           optimizer:Callable[...,torch.optim.optimizer.Optimizer],
           scheduler:Optional[Callable[...,Any]]=None)
              Type       Default   Details
nnet          ConvNet              model
num_classes   int                  number of classes
optimizer     Callable             optimizer
scheduler     Optional   None      scheduler
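The config-based usage below is the intended path; for orientation, here is a direct construction sketch based purely on the signature above (the partial-wrapped optimizer/scheduler and their settings are illustrative assumptions, not documented defaults):

from functools import partial
import torch

# hypothetical direct construction of ConvNetX (assumes the signature shown above)
nnet = ConvNet(n_features=[1, 8, 16, 32, 64], num_classes=10)
model = ConvNetX(
    nnet=nnet,
    num_classes=10,
    optimizer=partial(torch.optim.AdamW, lr=3e-4),                                 # Callable[..., Optimizer]
    scheduler=partial(torch.optim.lr_scheduler.StepLR, step_size=1, gamma=0.9),    # optional Callable
)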

Usage

cfg = OmegaConf.load('../config/model/image/convnetx.yaml')
feats_dim = [3, 8, 16, 32, 64, 128, 64]
cfg.nnet.n_features = feats_dim
cfg.nnet.num_classes = 200

model = instantiate(cfg.nnet)
B, C, H, W = 64, 3, 64, 64
X = torch.rand(B, C, H, W)
X.shape
print(model(X).shape)
torch.Size([64, 200])
summary(model, input_size=(B, C, H, W), depth=2)
==========================================================================================
Layer (type:depth-idx)                   Output Shape              Param #
==========================================================================================
ConvNet                                  [64, 200]                 --
├─Sequential: 1-1                        [64, 200]                 --
│    └─ConvBlock: 2-1                    [64, 8, 64, 64]           232
│    └─ConvBlock: 2-2                    [64, 16, 32, 32]          1,184
│    └─ConvBlock: 2-3                    [64, 32, 16, 16]          4,672
│    └─ConvBlock: 2-4                    [64, 64, 8, 8]            18,560
│    └─ConvBlock: 2-5                    [64, 128, 4, 4]           73,984
│    └─ConvBlock: 2-6                    [64, 64, 2, 2]            73,856
│    └─ConvBlock: 2-7                    [64, 200, 1, 1]           115,600
│    └─Flatten: 2-8                      [64, 200]                 --
==========================================================================================
Total params: 288,088
Trainable params: 288,088
Non-trainable params: 0
Total mult-adds (Units.MEGABYTES): 384.93
==========================================================================================
Input size (MB): 3.15
Forward/backward pass size (MB): 65.48
Params size (MB): 1.15
Estimated Total Size (MB): 69.78
==========================================================================================

Nimrod training

N_EPOCHS = 5

# data module config
cfg = OmegaConf.load('../config/data/image/fashion_mnist.yaml')
cfg.batch_size = 512
cfg.num_workers = 0
datamodule = instantiate(cfg)
datamodule.prepare_data()
datamodule.setup()
[20:23:50] INFO - Init ImageDataModule for fashion_mnist
[20:24:08] INFO - split train into train/val [0.8, 0.2]
[20:24:08] INFO - train: 48000 val: 12000, test: 10000
cfg = OmegaConf.load('../config/optimizer/adam_w.yaml')
optimizer = instantiate(cfg)

cfg = OmegaConf.load('../config/scheduler/step_lr.yaml')
scheduler = instantiate(cfg)

cfg = OmegaConf.load('../config/model/image/convnetx.yaml')
model = instantiate(cfg)(optimizer=optimizer, scheduler=scheduler)

# # with 1-cycle sched
# cfg.nnet.n_features = [1, 8, 16, 32, 64, 128]
# cfg.scheduler.total_steps = len(datamodule.train_ds) * N_EPOCHS
# model = instantiate(cfg)
[14:53:05] INFO - ConvNetX: init
[14:53:05] INFO - Classifier: init
/user/s/slegroux/miniconda3/envs/nimrod/lib/python3.11/site-packages/lightning/pytorch/utilities/parsing.py:209: Attribute 'nnet' is an instance of `nn.Module` and is already saved during checkpointing. It is recommended to ignore them using `self.save_hyperparameters(ignore=['nnet'])`.
trainer = Trainer(
    accelerator="auto",
    max_epochs=N_EPOCHS,
    logger=TensorBoardLogger("tb_logs", name="fashion_mnist_convnet", default_hp_metric=True),
    # logger=CSVLogger("logs", name="mnist_convnet"),
    callbacks = [LearningRateMonitor(logging_interval="step")],
    check_val_every_n_epoch=1,
    log_every_n_steps=1
    )
GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs

LR finder

tuner = Tuner(trainer)
lr_finder = tuner.lr_find(
    model,
    datamodule=datamodule,
    min_lr=1e-5,
    max_lr=1.0,
    num_training=100,  # number of iterations
    # attr_name="optimizer.lr",
)
fig = lr_finder.plot(suggest=True)
plt.show()
print(f"Suggested learning rate: {lr_finder.suggestion()}")
[20:59:14] INFO - Optimizer: <class 'torch.optim.adamw.AdamW'>
[20:59:14] INFO - Scheduler: <torch.optim.lr_scheduler.StepLR object>
/Users/slegroux/miniforge3/envs/nimrod/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:424: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.
/Users/slegroux/miniforge3/envs/nimrod/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:424: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.
`Trainer.fit` stopped: `max_steps=100` reached.
Learning rate set to 0.0019952623149688807
Restoring states from the checkpoint path at /Users/slegroux/Projects/nimrod/nbs/.lr_find_61a6646e-2298-4940-9b72-9185e67e8d21.ckpt
Restored all states from the checkpoint at /Users/slegroux/Projects/nimrod/nbs/.lr_find_61a6646e-2298-4940-9b72-9185e67e8d21.ckpt

Suggested learning rate: 0.0019952623149688807
print(trainer.max_epochs, len(datamodule.train_ds), datamodule.hparams.batch_size)
print(5*56000)
print(5*56000/2048)
print(5*56000//2048)
10 48000 512
280000
136.71875
136
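The 56000 / 2048 figures above are illustrative; with the values actually printed (48000 training samples, batch size 512), the per-epoch step count for the 1-cycle scheduler below works out to:

import math

n_train, batch_size = 48000, 512
steps_per_epoch = math.ceil(n_train / batch_size)
print(steps_per_epoch)  # 94 (93 if the dataloader drops the last partial batch)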

1-cycle scheduling

N_EPOCHS = 1
# lr_found = lr_finder.suggestion()
lr_found = 3e-4

# DATA
cfg = OmegaConf.load('../config/data/image/mnist.yaml')
cfg.batch_size = 512
cfg.num_workers = 0
datamodule = instantiate(cfg)
datamodule.prepare_data()
datamodule.setup()

checkpoint_callback = ModelCheckpoint(
    monitor='val/loss',  # Metric to monitor
    dirpath='checkpoints/',  # Directory to save checkpoints
    filename='epoch{epoch:02d}-val_loss{val/loss:.2f}',
    auto_insert_metric_name=False,
    save_top_k=1,  # Save only the best checkpoint
    mode='min'  # Mode can be 'min' or 'max' depending on the metric
)

lr_monitor = LearningRateMonitor(logging_interval="step")

# TRAINER 
trainer = Trainer(
    accelerator="auto",
    max_epochs=N_EPOCHS,
    # logger=TensorBoardLogger("tb_logs", name="mnist_convnet", default_hp_metric=True),
    logger=CSVLogger("logs", name="fashion_mnist_convnet"),
    callbacks = [lr_monitor, checkpoint_callback],
    check_val_every_n_epoch=1,
    log_every_n_steps=1
    )

print("estimated steps: ", trainer.estimated_stepping_batches, "accumulate_grad_batches: ", trainer.accumulate_grad_batches)

# MODEL
model_cfg = OmegaConf.load('../config/model/image/convnetx.yaml')
model_cfg.scheduler.total_steps = trainer.max_epochs * len(datamodule.train_dataloader())
model_cfg.scheduler.max_lr = lr_found#lr_finder.suggestion()

model = instantiate(model_cfg)

print("LR: ",model.lr)
trainer.fit(model, datamodule.train_dataloader(), datamodule.val_dataloader())

########################
csv_path = f"{trainer.logger.log_dir}/metrics.csv"
metrics = pd.read_csv(csv_path)
metrics.head()

##########################
plt.figure()
plt.plot(metrics['step'], metrics['train/loss_step'], 'b.-')
plt.plot(metrics['step'], metrics['val/loss'],'r.-')
plt.figure()
plt.plot(metrics['step'], metrics['lr-AdamW'], 'g.-')
plt.show()
[23:33:22] INFO - Init ImageDataModule for mnist
[23:33:26] INFO - loading dataset mnist with args () from split train
[23:33:33] INFO - loading dataset mnist with args () from split test
[23:33:36] INFO - split train into train/val [0.8, 0.2]
[23:33:36] INFO - train: 48000 val: 12000, test: 10000
GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
Loading `train_dataloader` to estimate number of stepping batches.
estimated steps:  -1 accumulate_grad_batches:  1
---------------------------------------------------------------------------
ConfigAttributeError                      Traceback (most recent call last)
Cell In[10], line 41
     39 # MODEL
     40 model_cfg = OmegaConf.load('../config/model/image/convnetx.yaml')
---> 41 model_cfg.scheduler.total_steps = trainer.max_epochs * len(datamodule.train_dataloader())
     42 model_cfg.scheduler.max_lr = lr_found#lr_finder.suggestion()
     44 model = instantiate(model_cfg)

[... omegaconf-internal traceback frames (dictconfig.py, base.py, _utils.py, basecontainer.py) elided ...]

ConfigAttributeError: Missing key scheduler
    full_key: scheduler
    object_type=dict
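The error above simply means that convnetx.yaml does not define a scheduler node, so attribute access on model_cfg.scheduler fails under OmegaConf's struct mode. A minimal defensive sketch (same OmegaConf/Hydra setup as above; the OneCycleLR target is an assumption inferred from the max_lr/total_steps fields, not the actual config file):

from omegaconf import OmegaConf, open_dict

model_cfg = OmegaConf.load('../config/model/image/convnetx.yaml')
total_steps = N_EPOCHS * len(datamodule.train_dataloader())

if OmegaConf.select(model_cfg, "scheduler") is not None:
    # the config already has a scheduler node: just override its fields
    model_cfg.scheduler.total_steps = total_steps
    model_cfg.scheduler.max_lr = lr_found
else:
    # otherwise add one explicitly (struct mode rejects new keys outside open_dict)
    with open_dict(model_cfg):
        model_cfg.scheduler = {
            "_target_": "torch.optim.lr_scheduler.OneCycleLR",  # assumed target
            "_partial_": True,
            "max_lr": lr_found,
            "total_steps": total_steps,
        }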
trainer.test(model, datamodule.test_dataloader(), ckpt_path="best")
Restoring states from the checkpoint path at /Users/slegroux/Projects/nimrod/nbs/checkpoints/epoch00-val_loss0.13.ckpt
Loaded model weights from the checkpoint at /Users/slegroux/Projects/nimrod/nbs/checkpoints/epoch00-val_loss0.13.ckpt
/Users/slegroux/miniforge3/envs/nimrod/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:424: The 'test_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
┃        Test metric               DataLoader 0        ┃
┡━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
│         test/acc              0.9707000255584717     │
│         test/loss             0.11260439455509186    │
└───────────────────────────┴───────────────────────────┘
[{'test/loss': 0.11260439455509186, 'test/acc': 0.9707000255584717}]
best_checkpoint_path = checkpoint_callback.best_model_path
print(f"Best checkpoint path: {best_checkpoint_path}")
Best checkpoint path: /Users/slegroux/Projects/nimrod/nbs/checkpoints/epoch00-val_loss0.13.ckpt

Resume training
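Resuming swaps in a ReduceLROnPlateau scheduler loaded from a Hydra-style config. The actual contents of reduce_lr_on_plateau.yaml are not shown here; judging from the hparams printout below (functools.partial(ReduceLROnPlateau, mode='min', factor=0.1, patience=10)), an equivalent config built in code would look roughly like this sketch:

# Sketch only: equivalent of what reduce_lr_on_plateau.yaml presumably contains
sched_cfg = OmegaConf.create({
    "_target_": "torch.optim.lr_scheduler.ReduceLROnPlateau",
    "_partial_": True,  # instantiate() returns a functools.partial; the optimizer is bound later
    "mode": "min",
    "factor": 0.1,
    "patience": 10,
})
sched = instantiate(sched_cfg)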

cfg = OmegaConf.load('../config/scheduler/reduce_lr_on_plateau.yaml')
sched = instantiate(cfg)
# sched.total_steps = len(datamodule.train_ds) * N_EPOCHS
# pick up the last learning rate reached by the previous run's optimizer
lr = trainer.optimizers[0].param_groups[0]['lr']
print(f"LR: {lr}")
model = ConvNetX.load_from_checkpoint(best_checkpoint_path, scheduler=sched, lr=lr)

pprint(model.hparams)
[20:56:41] INFO - ConvNetX: init
[20:56:41] INFO - Classifier: init
LR: 7.642883799445691e-07
"nnet":        ConvNet(
  (net): Sequential(
    (0): ConvLayer(
      (net): Sequential(
        (0): Conv2d(1, 8, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (1): BatchNorm2d(8, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (2): ReLU()
      )
    )
    (1): ConvLayer(
      (net): Sequential(
        (0): Conv2d(8, 16, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
        (1): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (2): ReLU()
      )
    )
    (2): ConvLayer(
      (net): Sequential(
        (0): Conv2d(16, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
        (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (2): ReLU()
      )
    )
    (3): ConvLayer(
      (net): Sequential(
        (0): Conv2d(32, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
        (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (2): ReLU()
      )
    )
    (4): ConvLayer(
      (net): Sequential(
        (0): Conv2d(64, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
        (1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (2): ReLU()
      )
    )
    (5): ConvLayer(
      (net): Sequential(
        (0): Conv2d(128, 10, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
      )
    )
    (6): Flatten(start_dim=1, end_dim=-1)
  )
)
"num_classes": 10
"optimizer":   functools.partial(<class 'torch.optim.adamw.AdamW'>, lr=0.0001, weight_decay=1e-05)
"scheduler":   functools.partial(<class 'torch.optim.lr_scheduler.ReduceLROnPlateau'>, mode='min', factor=0.1, patience=10)
/Users/slegroux/miniforge3/envs/nimrod/lib/python3.11/site-packages/lightning/pytorch/utilities/parsing.py:208: Attribute 'nnet' is an instance of `nn.Module` and is already saved during checkpointing. It is recommended to ignore them using `self.save_hyperparameters(ignore=['nnet'])`.
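The parsing.py warning above can be silenced inside the LightningModule itself by excluding the nn.Module from the saved hyperparameters. A sketch of the relevant __init__ fragment, assuming ConvNetX takes the arguments shown in the hparams printout:

from lightning.pytorch import LightningModule

class ConvNetX(LightningModule):
    def __init__(self, nnet, num_classes, optimizer, scheduler):
        super().__init__()
        # 'nnet' weights already live in the checkpoint's state_dict,
        # so keep the module itself out of the saved hyperparameters
        self.save_hyperparameters(ignore=['nnet'])
        self.nnet = nnet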
# batchnorm should allow us to try higher LR
# model_cfg = OmegaConf.load('../config/model/image/convnetx_adam.yaml')
# model_cfg.optimizer.lr = 0.1

# model = instantiate(model_cfg)
# opt = instantiate(model_cfg.optimizer)
# print(opt)
# sched = instantiate(model_cfg.scheduler)
# print(sched)


N_EPOCHS = 3

trainer = Trainer(
    accelerator="auto",
    max_epochs=N_EPOCHS,
    # logger=TensorBoardLogger("tb_logs", name="mnist_convnet", default_hp_metric=True),
    logger=CSVLogger("logs", name="fashion_mnist_convnet"),
    callbacks=[LearningRateMonitor(logging_interval="step")],
    check_val_every_n_epoch=1,
    log_every_n_steps=1
    )

# resume with the AdamW optimizer stored in the checkpoint and the ReduceLROnPlateau scheduler set above

# resume full training state from the best checkpoint of the earlier run
trainer.fit(model, datamodule.train_dataloader(), datamodule.val_dataloader(), ckpt_path=best_checkpoint_path)
GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
Restoring states from the checkpoint path at /Users/slegroux/Projects/nimrod/nbs/checkpoints/epoch00-val_loss0.13.ckpt
/Users/slegroux/miniforge3/envs/nimrod/lib/python3.11/site-packages/lightning/pytorch/trainer/call.py:273: Be aware that when using `ckpt_path`, callbacks used to create the checkpoint need to be provided during `Trainer` instantiation. Please add the following callbacks: ["ModelCheckpoint{'monitor': 'val/loss', 'mode': 'min', 'every_n_train_steps': 0, 'every_n_epochs': 1, 'train_time_interval': None}"].
[20:56:49] INFO - Optimizer: AdamW (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    lr: 0.0001
    maximize: False
    weight_decay: 1e-05
)
[20:56:49] INFO - Scheduler: <torch.optim.lr_scheduler.ReduceLROnPlateau object>
/Users/slegroux/miniforge3/envs/nimrod/lib/python3.11/site-packages/lightning/pytorch/core/optimizer.py:316: The lr scheduler dict contains the key(s) ['monitor'], but the keys will be ignored. You need to call `lr_scheduler.step()` manually in manual optimization.

  | Name         | Type               | Params | Mode 
------------------------------------------------------------
0 | loss         | CrossEntropyLoss   | 0      | train
1 | train_acc    | MulticlassAccuracy | 0      | train
2 | val_acc      | MulticlassAccuracy | 0      | train
3 | test_acc     | MulticlassAccuracy | 0      | train
4 | train_loss   | MeanMetric         | 0      | train
5 | val_loss     | MeanMetric         | 0      | train
6 | test_loss    | MeanMetric         | 0      | train
7 | val_acc_best | MaxMetric          | 0      | train
8 | nnet         | ConvNet            | 110 K  | train
------------------------------------------------------------
110 K     Trainable params
0         Non-trainable params
110 K     Total params
0.440     Total estimated model params size (MB)
39        Modules in train mode
0         Modules in eval mode
Restored all states from the checkpoint at /Users/slegroux/Projects/nimrod/nbs/checkpoints/epoch00-val_loss0.13.ckpt
/Users/slegroux/miniforge3/envs/nimrod/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:424: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.
/Users/slegroux/miniforge3/envs/nimrod/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:424: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.
[20:57:08] INFO - scheduler is an instance of Reduce plateau
[20:57:26] INFO - scheduler is an instance of Reduce plateau
`Trainer.fit` stopped: `max_epochs=3` reached.
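Note the warning emitted at the start of this resumed run: the new Trainer was created without the ModelCheckpoint callback that produced epoch00-val_loss0.13.ckpt, so Lightning falls back to its default checkpointing under the logger directory (which is why `ckpt_path="best"` below resolves to logs/.../epoch=2-step=282.ckpt). To keep tracking the best val/loss checkpoint across the resumed run, the same callback could simply be passed again; a sketch reusing the earlier definition:

# Sketch: re-create the same checkpoint callback for the resumed Trainer
trainer = Trainer(
    accelerator="auto",
    max_epochs=N_EPOCHS,
    logger=CSVLogger("logs", name="fashion_mnist_convnet"),
    callbacks=[
        LearningRateMonitor(logging_interval="step"),
        ModelCheckpoint(monitor='val/loss', dirpath='checkpoints/',
                        filename='epoch{epoch:02d}-val_loss{val/loss:.2f}',
                        auto_insert_metric_name=False, save_top_k=1, mode='min'),
    ],
    check_val_every_n_epoch=1,
    log_every_n_steps=1,
)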
csv_path = f"{trainer.logger.log_dir}/metrics.csv"
metrics = pd.read_csv(csv_path)
metrics.head()
plt.figure()
plt.plot(metrics['step'], metrics['train/loss_step'], 'b.-')
plt.plot(metrics['step'], metrics['val/loss'],'r.-')
plt.figure()
plt.plot(metrics['step'], metrics['lr-AdamW'], 'g.-')
plt.show()
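Lightning's CSVLogger writes training-step and validation metrics on separate rows, so columns like val/loss are NaN at most steps and the plots above show gaps. A small sketch that drops the missing rows per metric before plotting (column names as used above):

# drop the NaN rows CSVLogger leaves when a metric is not logged at a given step
train = metrics.dropna(subset=['train/loss_step'])
val = metrics.dropna(subset=['val/loss'])

plt.figure()
plt.plot(train['step'], train['train/loss_step'], 'b.-', label='train')
plt.plot(val['step'], val['val/loss'], 'r.-', label='val')
plt.legend()
plt.show()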
trainer.test(model, datamodule.test_dataloader(), ckpt_path="best")

Restoring states from the checkpoint path at logs/fashion_mnist_convnet/version_14/checkpoints/epoch=2-step=282.ckpt
Loaded model weights from the checkpoint at logs/fashion_mnist_convnet/version_14/checkpoints/epoch=2-step=282.ckpt
/Users/slegroux/miniforge3/envs/nimrod/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:424: The 'test_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
┃        Test metric               DataLoader 0        ┃
┡━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
│         test/acc              0.9711999893188477     │
│         test/loss             0.11154787242412567    │
└───────────────────────────┴───────────────────────────┘