Commit 2f46839b by Christian Margreitter

Bug fix in backend parallelization that could lead to issues when entire sublists failed to generate a structure.

Bug fix in backend parallelization that could lead to issues when entire sublists failed to generate a structure.
parent 7bdfd4a6
......@@ -178,12 +178,12 @@ class AutodockVina(Docker, BaseModel):
if not os.path.exists(self.parameters.receptor_pdbqt_path[0]):
raise DockingRunFailed("Specified PDBQT path to target (receptor) does not exist - abort.")
jobs_submitted = 0
sublists_submitted = 0
slices_per_iteration = min(number_cores, number_sublists)
while jobs_submitted < len(sublists):
upper_bound_slice = min((jobs_submitted + slices_per_iteration), len(sublists))
cur_slice_start_indices = start_indices[jobs_submitted:upper_bound_slice]
cur_slice_sublists = sublists[jobs_submitted:upper_bound_slice]
while sublists_submitted < len(sublists):
upper_bound_slice = min((sublists_submitted + slices_per_iteration), len(sublists))
cur_slice_start_indices = start_indices[sublists_submitted:upper_bound_slice]
cur_slice_sublists = sublists[sublists_submitted:upper_bound_slice]
# generate paths and initialize molecules (so that if they fail, this can be covered)
tmp_output_dirs, tmp_input_paths, tmp_output_paths, \
......@@ -197,10 +197,13 @@ class AutodockVina(Docker, BaseModel):
tmp_output_paths[chunk_index]))
processes.append(p)
p.start()
jobs_submitted += 1
for p in processes:
p.join()
# add the number of input sublists rather than the output temporary folders to account for cases where
# entire sublists failed to produce an input structure
sublists_submitted += len(cur_slice_sublists)
# parse the resulting sdf files
for path_sdf_results, cur_identifier in zip(tmp_output_paths, ligand_identifiers):
# add conformations
......@@ -226,7 +229,7 @@ class AutodockVina(Docker, BaseModel):
# clean-up
for path in tmp_output_dirs:
shutil.rmtree(path)
self._log_docking_progress(number_done=jobs_submitted, number_total=number_sublists)
self._log_docking_progress(number_done=sublists_submitted, number_total=number_sublists)
# the conformers are already sorted, but some tags are missing
# -> <ligand_number>:<enumeration>:<conformer_number>
......
......@@ -202,13 +202,13 @@ class Gold(Docker):
start_indices, sublists = self.get_sublists_for_docking(number_cores=number_cores)
number_sublists = len(sublists)
self._logger.log(f"Split ligands into {number_sublists} sublists for docking.", _LE.DEBUG)
jobs_submitted = 0
sublists_submitted = 0
slices_per_iteration = min(number_cores, number_sublists)
while jobs_submitted < len(sublists):
upper_bound_slice = min((jobs_submitted + slices_per_iteration), len(sublists))
cur_slice_start_indices = start_indices[jobs_submitted:upper_bound_slice]
cur_slice_sublists = sublists[jobs_submitted:upper_bound_slice]
while sublists_submitted < len(sublists):
upper_bound_slice = min((sublists_submitted + slices_per_iteration), len(sublists))
cur_slice_start_indices = start_indices[sublists_submitted:upper_bound_slice]
cur_slice_sublists = sublists[sublists_submitted:upper_bound_slice]
# generate paths and initialize molecules (so that if they fail, this can be covered)
tmp_output_dirs, tmp_input_sdf_paths, \
......@@ -223,10 +223,13 @@ class Gold(Docker):
tmp_output_dirs[chunk_index]))
processes.append(p)
p.start()
jobs_submitted += 1
for p in processes:
p.join()
# add the number of input sublists rather than the output temporary folders to account for cases where
# entire sublists failed to produce an input structure
sublists_submitted += len(cur_slice_sublists)
# load the chunks and recombine the result; add conformations
for chunk_index in range(len(tmp_output_dirs)):
# this is a protection against the case where empty (file size == 0 bytes) files are generated due to
......@@ -253,7 +256,7 @@ class Gold(Docker):
# clean-up
for path in tmp_output_dirs:
shutil.rmtree(path)
self._log_docking_progress(number_done=jobs_submitted, number_total=number_sublists)
self._log_docking_progress(number_done=sublists_submitted, number_total=number_sublists)
# update conformer names to contain the conformer id
# -> <ligand_number>:<enumeration>:<conformer_number>
......
......@@ -141,20 +141,20 @@ class OpenEye(Docker):
number_sublists = len(sublists)
self._logger.log(f"Split ligands into {len(sublists)} sublists for docking.", _LE.DEBUG)
jobs_submitted = 0
while jobs_submitted < len(sublists):
sublists_submitted = 0
while sublists_submitted < len(sublists):
processes = []
return_queues = []
for _ in range(number_cores):
if jobs_submitted >= len(sublists):
if sublists_submitted >= len(sublists):
continue
cur_queue = multiprocessing.Queue()
p = multiprocessing.Process(target=self._dock_subjob, args=(sublists[jobs_submitted],
p = multiprocessing.Process(target=self._dock_subjob, args=(sublists[sublists_submitted],
cur_queue))
processes.append(p)
p.start()
return_queues.append(cur_queue)
jobs_submitted += 1
sublists_submitted += 1
for p in processes:
p.join()
......@@ -164,7 +164,7 @@ class OpenEye(Docker):
if cur_ligand_name == ligand.get_identifier():
ligand.set_conformers(cur_slice[cur_ligand_name])
break
self._log_docking_progress(number_done=jobs_submitted, number_total=number_sublists)
self._log_docking_progress(number_done=sublists_submitted, number_total=number_sublists)
# update conformer names to contain the conformer id -> <ligand_number>:<enumeration>:<conformer_number>
for ligand in self.ligands:
......
......@@ -185,13 +185,13 @@ class OmegaLigandPreparator(LigandPreparator, BaseModel):
number_sublists = len(sublists)
self._logger.log(f"Split ligands into {number_sublists} sublists for embedding.", _LE.DEBUG)
jobs_submitted = 0
sublists_submitted = 0
slices_per_iteration = min(number_cores, number_sublists)
if isinstance(self.ligands[0].get_molecule(), Chem.Mol):
while jobs_submitted < len(sublists):
upper_bound_slice = min((jobs_submitted + slices_per_iteration), len(sublists))
cur_slice_start_indices = start_indices[jobs_submitted:upper_bound_slice]
cur_slice_sublists = sublists[jobs_submitted:upper_bound_slice]
while sublists_submitted < len(sublists):
upper_bound_slice = min((sublists_submitted + slices_per_iteration), len(sublists))
cur_slice_start_indices = start_indices[sublists_submitted:upper_bound_slice]
cur_slice_sublists = sublists[sublists_submitted:upper_bound_slice]
# generate paths and initialize molecules (so that if they fail, this can be covered)
tmp_output_dirs, tmp_input_smi_paths, \
......@@ -206,10 +206,13 @@ class OmegaLigandPreparator(LigandPreparator, BaseModel):
tmp_output_dirs[chunk_index]))
processes.append(p)
p.start()
jobs_submitted += 1
for p in processes:
p.join()
# add the number of input sublists rather than the output temporary folders to account for cases where
# entire sublists failed to produce an input structure
sublists_submitted += len(cur_slice_sublists)
# load and store the conformers; name it sequentially
# note, that some backends require the H-coordinates (such as Glide) - so keep them!
ligands_embedded = []
......@@ -262,7 +265,7 @@ class OmegaLigandPreparator(LigandPreparator, BaseModel):
# remove temporary files
for path in tmp_output_dirs:
shutil.rmtree(path)
self._log_docking_progress(number_done=jobs_submitted, number_total=number_sublists)
self._log_docking_progress(number_done=sublists_submitted, number_total=number_sublists)
# check success and failure with embedding
failed = 0
......
......@@ -133,13 +133,13 @@ class OpenEyeHybrid(Docker):
start_indices, sublists = self.get_sublists_for_docking(number_cores=number_cores)
number_sublists = len(sublists)
self._logger.log(f"Split ligands into {number_sublists} sublists for docking.", _LE.DEBUG)
jobs_submitted = 0
sublists_submitted = 0
slices_per_iteration = min(number_cores, number_sublists)
while jobs_submitted < len(sublists):
upper_bound_slice = min((jobs_submitted + slices_per_iteration), len(sublists))
cur_slice_start_indices = start_indices[jobs_submitted:upper_bound_slice]
cur_slice_sublists = sublists[jobs_submitted:upper_bound_slice]
while sublists_submitted < len(sublists):
upper_bound_slice = min((sublists_submitted + slices_per_iteration), len(sublists))
cur_slice_start_indices = start_indices[sublists_submitted:upper_bound_slice]
cur_slice_sublists = sublists[sublists_submitted:upper_bound_slice]
# generate paths and initialize molecules (so that if they fail, this can be covered)
tmp_output_dirs, tmp_input_sdf_paths, \
......@@ -154,10 +154,13 @@ class OpenEyeHybrid(Docker):
tmp_output_dirs[chunk_index]))
processes.append(p)
p.start()
jobs_submitted += 1
for p in processes:
p.join()
# add the number of input sublists rather than the output temporary folders to account for cases where
# entire sublists failed to produce an input structure
sublists_submitted += len(cur_slice_sublists)
# load the chunks and recombine the result; add conformations
for chunk_index in range(len(tmp_output_dirs)):
# this is a protection against the case where empty (file size == 0 bytes) files are generated due to
......@@ -180,7 +183,7 @@ class OpenEyeHybrid(Docker):
# clean-up
for path in tmp_output_dirs:
shutil.rmtree(path)
self._log_docking_progress(number_done=jobs_submitted, number_total=number_sublists)
self._log_docking_progress(number_done=sublists_submitted, number_total=number_sublists)
# sort the conformers (best to worst), update their names to contain the conformer id and add tags
# -> <ligand_number>:<enumeration>:<conformer_number>
......
......@@ -398,13 +398,13 @@ class Glide(Docker, BaseModel):
number_sublists = len(sublists)
number_ligands_per_sublist = len(sublists[0])
self._logger.log(f"Split ligands into {number_sublists} sublists for docking.", _LE.DEBUG)
jobs_submitted = 0
sublists_submitted = 0
slices_per_iteration = min(number_cores, number_sublists)
while jobs_submitted < len(sublists):
upper_bound_slice = min((jobs_submitted + slices_per_iteration), len(sublists))
cur_slice_start_indices = start_indices[jobs_submitted:upper_bound_slice]
cur_slice_sublists = sublists[jobs_submitted:upper_bound_slice]
while sublists_submitted < len(sublists):
upper_bound_slice = min((sublists_submitted + slices_per_iteration), len(sublists))
cur_slice_start_indices = start_indices[sublists_submitted:upper_bound_slice]
cur_slice_sublists = sublists[sublists_submitted:upper_bound_slice]
# generate paths and initialize molecules (so that if they fail, this can be covered)
tmp_output_dirs, tmp_input_mae_paths, \
......@@ -424,10 +424,13 @@ class Glide(Docker, BaseModel):
number_ligands_per_sublist))
processes.append(p)
p.start()
jobs_submitted += 1
for p in processes:
p.join()
# add the number of input sublists rather than the output temporary folders to account for cases where
# entire sublists failed to produce an input structure
sublists_submitted += len(cur_slice_sublists)
# parse the resulting sdf files
for path_sdf_results in tmp_output_sdf_paths:
# this is a protection against the case where empty (file size == 0 bytes) files are generated due to
......@@ -449,7 +452,7 @@ class Glide(Docker, BaseModel):
# clean-up
for path in tmp_output_dirs:
shutil.rmtree(path)
self._log_docking_progress(number_done=jobs_submitted, number_total=number_sublists)
self._log_docking_progress(number_done=sublists_submitted, number_total=number_sublists)
# sort the conformers (best to worst) and update their names to contain the conformer id
# -> <ligand_number>:<enumeration>:<conformer_number>
......
......@@ -254,12 +254,12 @@ class LigprepLigandPreparator(LigandPreparator, BaseModel):
self._logger.log(f"Split ligands into {number_sublists} sublists for embedding.",
_LE.DEBUG)
jobs_submitted = 0
sublists_submitted = 0
slices_per_iteration = min(number_cores, number_sublists)
while jobs_submitted < len(sublists):
upper_bound_slice = min((jobs_submitted + slices_per_iteration), len(sublists))
cur_slice_start_indices = start_indices[jobs_submitted:upper_bound_slice]
cur_slice_sublists = sublists[jobs_submitted:upper_bound_slice]
while sublists_submitted < len(sublists):
upper_bound_slice = min((sublists_submitted + slices_per_iteration), len(sublists))
cur_slice_start_indices = start_indices[sublists_submitted:upper_bound_slice]
cur_slice_sublists = sublists[sublists_submitted:upper_bound_slice]
# generate paths and initialize molecules (so that if they fail, this can be covered)
tmp_output_dirs, tmp_input_smi_paths, \
......@@ -279,10 +279,13 @@ class LigprepLigandPreparator(LigandPreparator, BaseModel):
tmp_input_filter_paths[chunk_index]))
processes.append(p)
p.start()
jobs_submitted += 1
for p in processes:
p.join()
# add the number of input sublists rather than the output temporary folders to account for cases where
# entire sublists failed to produce an input structure
sublists_submitted += len(cur_slice_sublists)
# load and store the conformers; name it sequentially
# note, that some backends require the H-coordinates (such as Glide) - so keep them!
ligands_embedded = []
......@@ -307,7 +310,7 @@ class LigprepLigandPreparator(LigandPreparator, BaseModel):
# remove temporary files
for path in tmp_output_dirs:
shutil.rmtree(path)
self._log_docking_progress(number_done=jobs_submitted, number_total=number_sublists)
self._log_docking_progress(number_done=sublists_submitted, number_total=number_sublists)
# check success and failure with embedding
failed = 0
......
......@@ -120,12 +120,12 @@ class rDock(Docker):
number_sublists = len(sublists)
self._logger.log(f"Split ligands into {number_sublists} sublists for docking.", _LE.DEBUG)
jobs_submitted = 0
sublists_submitted = 0
slices_per_iteration = min(number_cores, number_sublists)
while jobs_submitted < len(sublists):
upper_bound_slice = min((jobs_submitted + slices_per_iteration), len(sublists))
cur_slice_start_indices = start_indices[jobs_submitted:upper_bound_slice]
cur_slice_sublists = sublists[jobs_submitted:upper_bound_slice]
while sublists_submitted < len(sublists):
upper_bound_slice = min((sublists_submitted + slices_per_iteration), len(sublists))
cur_slice_start_indices = start_indices[sublists_submitted:upper_bound_slice]
cur_slice_sublists = sublists[sublists_submitted:upper_bound_slice]
# generate paths and initialize molecules (so that if they fail, this can be covered)
tmp_output_dirs, tmp_input_sdf_paths, \
......@@ -140,10 +140,13 @@ class rDock(Docker):
tmp_output_sdf_paths[chunk_index]))
processes.append(p)
p.start()
jobs_submitted += 1
for p in processes:
p.join()
# add the number of input sublists rather than the output temporary folders to account for cases where
# entire sublists failed to produce an input structure
sublists_submitted += len(cur_slice_sublists)
# load the chunks and recombine the result; add conformations
for chunk_index in range(len(tmp_output_dirs)):
if not os.path.isfile(tmp_output_sdf_paths[chunk_index]) or os.path.getsize(tmp_output_sdf_paths[chunk_index]) == 0:
......@@ -165,7 +168,7 @@ class rDock(Docker):
# clean-up
for path in tmp_output_dirs:
shutil.rmtree(path)
self._log_docking_progress(number_done=jobs_submitted, number_total=number_sublists)
self._log_docking_progress(number_done=sublists_submitted, number_total=number_sublists)
# sort the conformers (best to worst), update their names to contain the conformer id and add tags
# -> <ligand_number>:<enumeration>:<conformer_number>
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment