Skip to content

Commit

Permalink
udf timing
Browse files Browse the repository at this point in the history
  • Loading branch information
ajz34 committed May 22, 2021
1 parent 7943e50 commit 347c93e
Show file tree
Hide file tree
Showing 5 changed files with 23 additions and 5 deletions.
6 changes: 4 additions & 2 deletions TODO.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
## 1. 服务器部署

- [ ] 1-1 需要再验证一下大体系、小内存情况下的数值导数
- [ ] 1-2 需要检查内存是否控制过于严格或产生溢出 (特别是开壳层)
- [x] 1-2 需要检查内存是否控制过于严格或产生溢出 (特别是开壳层) **结论:似乎没有严重溢出,但有些过剩。以后还需要微调。**
- [ ] 1-3 **未完全解决** 确定 cProfile 的工作流程,并确定各函数调用时间与打印方式
- [x] 1-4 确定较为自动的脚本,使得对于同一输入卡,服务器与本地可以轻松地分别执行大分子与小分子计算
- [ ] 1-5 确定是否能写队列脚本,是否可以用 Gaussian 输入卡作为 CLI
Expand Down Expand Up @@ -88,4 +88,6 @@
- [x] 8-1 `get_gradient_jk`:并行效率 37/40,无需更改
- [ ] 8-2 `get_gradient_gga`:并行效率存在问题,但似乎是内存 bandwidth 控制,难以修改代码
- [ ] 8-3 `get_cderi_mo` 与 `get_cpks_eri`:这些涉及到是否允许 async 读写盘;但目前似乎无法判断程序效率。
甚至感到使用 async 之后程序效率更低;可能需要询问专家了。
甚至感到使用 async 之后程序效率更低;可能需要询问专家了。
- [ ] 8-4 `Ax0_Core_HF` 与 `Ax0_cpks_HF` 的效率在小体系体现不出问题,但大体系需要关心。
还是解决不了异步的问题。
1 change: 1 addition & 0 deletions pyscf/dh/dhutil.py
Original file line number Diff line number Diff line change
Expand Up @@ -167,6 +167,7 @@ def gen_shl_batch(mol, blksize, start_id=0, stop_id=None):

def calc_batch_size(unit_flop, mem_avail, pre_flop=0):
    """Return how many batch units fit into the available memory.

    Parameters
    ----------
    unit_flop : number
        Size of one batch unit, counted in 8-byte (double) words
        — presumably per-batch tensor element count; confirm with callers.
    mem_avail : number
        Available memory in MB.
    pre_flop : number, optional
        Memory already committed before batching, in 8-byte words.

    Returns
    -------
    int
        Batch size; always at least 1 so the caller makes progress
        even when memory is exhausted.
    """
    # Keep a 20% safety margin on the reported available memory,
    # then subtract what is already allocated (words -> MB via *8/1024**2).
    max_memory = 0.8 * mem_avail - pre_flop * 8 / 1024 ** 2
    batch_size = int(max(max_memory // (unit_flop * 8 / 1024 ** 2), 1))
    return batch_size
Expand Down
8 changes: 7 additions & 1 deletion pyscf/dh/grad/udfdh.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from __future__ import annotations

from dh import UDFDH
from dh.dhutil import calc_batch_size, gen_batch, gen_shl_batch, tot_size
from dh.dhutil import calc_batch_size, gen_batch, gen_shl_batch, tot_size, timing
from dh.grad.rdfdh import contract_multiple_rho, get_H_1_ao, get_S_1_ao, generator_L_1
import dh.grad.rdfdh
from pyscf import gto, lib, df
Expand All @@ -14,6 +14,7 @@
αα, αβ, ββ = 0, 1, 2


@timing
def get_gradient_jk(dfobj: df.DF, C, D, D_r, Y_mo, cx, cx_n, max_memory=2000):
mol, aux = dfobj.mol, dfobj.auxmol
natm, nao, nmo, nocc = mol.natm, mol.nao, C.shape[-1], mol.nelec
Expand Down Expand Up @@ -105,12 +106,14 @@ def __init__(self, mol: gto.Mole, skip_construct=False, *args, **kwargs):
self.grad_tot = NotImplemented
self.de = NotImplemented

@timing
def prepare_H_1(self):
    """Build the derivative core-Hamiltonian integrals and cache them.

    Stores "H_1_ao" (AO basis) and "H_1_mo" (MO basis, transformed with
    the MO coefficients ``self.C``) in the tensor store.
    """
    ao_block = get_H_1_ao(self.mol)
    self.tensors.create("H_1_ao", ao_block)
    # AO -> MO transformation: C^T . H_1 . C for each spin/atom component.
    mo_block = einsum("sup, Auv, svq -> sApq", self.C, ao_block, self.C)
    self.tensors.create("H_1_mo", mo_block)

@timing
def prepare_S_1(self):
S_1_ao = get_S_1_ao(self.mol)
S_1_mo = einsum("sup, Auv, svq -> sApq", self.C, S_1_ao, self.C)
Expand All @@ -124,6 +127,7 @@ def prepare_gradient_jk(self):
cx_n = self.cx_n if self.xc_n else self.cx
self.grad_jk = get_gradient_jk(self.df_jk, self.C, self.D, D_r, Y_mo, self.cx, cx_n, self.get_memory())

@timing
def prepare_gradient_gga(self):
tensors = self.tensors
if "rho" not in tensors:
Expand Down Expand Up @@ -153,6 +157,7 @@ def prepare_gradient_gga(self):
self.grad_gga = grad_contrib
return self

@timing
def prepare_gradient_pt2(self):
tensors = self.tensors
C, D, e = self.C, self.D, self.e
Expand Down Expand Up @@ -210,6 +215,7 @@ def prepare_gradient_pt2(self):
grad_corr[A] += einsum("Pia, tPia -> t", G_ia_ri[σ], Y_1_ia_ri[σ])
self.grad_pt2 = grad_corr

@timing
def prepare_gradient_enfunc(self):
tensors = self.tensors
natm = self.mol.natm
Expand Down
2 changes: 1 addition & 1 deletion pyscf/dh/rdfdh.py
Original file line number Diff line number Diff line change
Expand Up @@ -206,7 +206,6 @@ def load(r0, r1, pre):
with lib.call_in_background(load) as bload:
load(0-nbatch, 0, pre_load)
for sA in gen_batch(nocc, nmo, nbatch):
print(sA)
nA = sA.stop - sA.start
sAvir = slice(sA.start - nocc, sA.stop - nocc)
buf_load, pre_load = pre_load, buf_load
Expand Down Expand Up @@ -410,6 +409,7 @@ def get_memory(self): # leave at least 500MB space anyway
return max(self.max_memory - lib.current_memory()[0], 500)

def calc_batch_size(self, unit_flop, pre_flop=0, fixed_mem=None):
print("DEBUG: self.get_memory", self.get_memory())
if self._fixed_batch:
return self._fixed_batch
if fixed_mem:
Expand Down
11 changes: 10 additions & 1 deletion pyscf/dh/udfdh.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,7 @@ def energy_elec(mf: UDFDH, params=None, **kwargs):
# region first derivative related


@timing
def get_eri_cpks(Y_mo_jk, nocc, cx, eri_cpks=None, max_memory=2000):
naux, nmo, _ = Y_mo_jk[0].shape
nvir = nmo - nocc[α], nmo - nocc[β]
Expand Down Expand Up @@ -141,6 +142,7 @@ def Ax0_cpks_HF(eri_cpks, max_memory=2000):
nocc = eri_cpks[αα].shape[1], eri_cpks[ββ].shape[1]
mvir, mocc = max(nvir), max(nocc)

@timing
def Ax0_cpks_HF_inner(X):
prop_shape = X[0].shape[:-2]
X = [X[σ].reshape(-1, X[σ].shape[-2], X[σ].shape[-1]) for σ in (α, β)]
Expand All @@ -165,6 +167,7 @@ def Ax0_Core_HF(si, sa, sj, sb, cx, Y_mo_jk, max_memory=2000):
ni = [si[σ].stop - si[σ].start for σ in (α, β)]
na = [sa[σ].stop - sa[σ].start for σ in (α, β)]

@timing
def Ax0_Core_HF_inner(X):
prop_shape = X[0].shape[:-2]
X = [X[σ].reshape(-1, X[σ].shape[-2], X[σ].shape[-1]) for σ in (α, β)]
Expand All @@ -189,6 +192,7 @@ def Ax0_Core_KS(si, sa, sj, sb, mo_coeff, xc_setting, xc_kernel):
ni, mol, grids, xc, dm = xc_setting
rho, vxc, fxc = xc_kernel

@timing
def Ax0_Core_KS_inner(X):
prop_shape = X[0].shape[:-2]
X = [X[σ].reshape(-1, X[σ].shape[-2], X[σ].shape[-1]) for σ in (α, β)]
Expand Down Expand Up @@ -220,6 +224,7 @@ def __init__(self,
self.mvir = NotImplemented
self.mocc = max(max(self.nocc), 1)

@timing
def run_scf(self):
self.mf_s.grids = self.mf_n.grids = self.grids
self.build()
Expand Down Expand Up @@ -285,6 +290,7 @@ def Ax0_cpks_inner(X):
return [ax0_hf[σ] + ax0_ks[σ] for σ in (α, β)]
return Ax0_cpks_inner

@timing
def solve_cpks(self, rhs):
nocc, nvir = self.nocc, self.nvir

Expand Down Expand Up @@ -323,7 +329,8 @@ def prepare_integral(self):
get_eri_cpks([tensors["Y_mo_jk" + str(σ)] for σ in (α, β)], nocc, self.cx, eri_cpks, self.get_memory())
return self

def prepare_pt2(self, dump_t_ijab=True):
@timing
def prepare_pt2(self, dump_t_ijab=True, fast_trans=True):
tensors = self.tensors
nvir, nocc, nmo = self.nvir, self.nocc, self.nmo
mocc, mvir = max(nocc), max(nvir)
Expand Down Expand Up @@ -403,6 +410,7 @@ def prepare_pt2(self, dump_t_ijab=True):

return self

@timing
def prepare_lagrangian(self, gen_W=False):
tensors = self.tensors
nvir, nocc, nmo, naux = self.nvir, self.nocc, self.nmo, self.df_ri.get_naoaux()
Expand Down Expand Up @@ -444,6 +452,7 @@ def prepare_lagrangian(self, gen_W=False):
tensors.create("L" + str(σ), L[σ])
return self

@timing
def prepare_D_r(self):
tensors = self.tensors
sv, so = self.sv, self.so
Expand Down

0 comments on commit 347c93e

Please sign in to comment.