Updates

+ Update sanitization regex to clean exclaimation points + Update _sanitize method to use _clean method + Moved lecture file exist check to parse method, this will now also check if html files exist + Update chapter folder titles to use - for separating number from name
wvlover15 · Aug 8, 2021 · ca40ff2 · ca40ff2
1 parent 6137e44
commit ca40ff2
Showing 1 changed file with 48 additions and 40 deletions.
diff --git a/main.py b/main.py
@@ -42,15 +42,15 @@
 
 
 def _clean(text):
-    ok = re.compile(r'[^\\/:*?"<>|]')
+    ok = re.compile(r'[^\\/:*?!"<>|]')
     text = "".join(x if ok.match(x) else "_" for x in text)
     text = re.sub(r"\.+$", "", text.strip())
     return text
 
 
 def _sanitize(self, unsafetext):
-    text = sanitize(
-        slugify(unsafetext, lower=False, spaces=True, ok=SLUG_OK + "().[]"))
+    text = _clean(sanitize(
+        slugify(unsafetext, lower=False, spaces=True, ok=SLUG_OK + "().[]")))
     return text
 
 
@@ -1028,22 +1028,17 @@ def process_lecture(lecture, lecture_path, lecture_file_name, quality, access_to
 
     if is_encrypted:
         if len(lecture_sources) > 0:
-            if not os.path.isfile(lecture_path):
-                source = lecture_sources[-1]  # last index is the best quality
-                if isinstance(quality, int):
-                    source = min(
-                        lecture_sources,
-                        key=lambda x: abs(int(x.get("height")) - quality))
-                print(f"      > Lecture '%s' has DRM, attempting to download" %
-                      lecture_title)
-                handle_segments(source.get("download_url"),
-                                source.get(
-                                    "format_id"), lecture_title, lecture_path, lecture_file_name,
-                                concurrent_connections, chapter_dir)
-            else:
-                print(
-                    "      > Lecture '%s' is already downloaded, skipping..." %
-                    lecture_title)
+            source = lecture_sources[-1]  # last index is the best quality
+            if isinstance(quality, int):
+                source = min(
+                    lecture_sources,
+                    key=lambda x: abs(int(x.get("height")) - quality))
+            print(f"      > Lecture '%s' has DRM, attempting to download" %
+                  lecture_title)
+            handle_segments(source.get("download_url"),
+                            source.get(
+                                "format_id"), lecture_title, lecture_path, lecture_file_name,
+                            concurrent_connections, chapter_dir)
         else:
             print(f"      > Lecture '%s' is missing media links" %
                   lecture_title)
@@ -1117,31 +1112,44 @@ def parse_new(_udemy, quality, skip_lectures, dl_assets, dl_captions,
         for lecture in chapter.get("lectures"):
             lecture_title = lecture.get("lecture_title")
             lecture_index = lecture.get("lecture_index")
+            lecture_extension = lecture.get("extension")
+            extension = "mp4"  # video lectures dont have an extension property, so we assume its mp4
+            if lecture_extension != None:
+                # if the lecture extension property isnt none, set the extension to the lecture extension
+                extension = lecture_extension
+            lecture_file_name = sanitize(lecture_title + "." + extension)
+            lecture_path = os.path.join(
+                chapter_dir,
+                lecture_file_name)
 
-            extension = lecture.get("extension")
             print(
                 f"  > Processing lecture {lecture_index} of {total_lectures}")
             if not skip_lectures:
-                if extension == "html":
-                    html_content = lecture.get("html_content").encode(
-                        "ascii", "ignore").decode("utf8")
-                    lecture_path = os.path.join(
-                        chapter_dir, "{}.html".format(sanitize(lecture_title)))
-                    try:
-                        with open(lecture_path, 'w') as f:
-                            f.write(html_content)
-                            f.close()
-                    except Exception as e:
-                        print("    > Failed to write html file: ", e)
-                        continue
+                print(lecture_file_name)
+                # Check if the lecture is already downloaded
+                if os.path.isfile(lecture_path):
+                    print(
+                        "      > Lecture '%s' is already downloaded, skipping..." %
+                        lecture_title)
+                    continue
                 else:
-                    lecture_file_name = sanitize(lecture_title + ".mp4")
-                    lecture_path = os.path.join(
-                        chapter_dir,
-                        lecture_file_name)
-                    process_lecture(lecture, lecture_path, lecture_file_name,
-                                    quality, access_token,
-                                    concurrent_connections, chapter_dir)
+                    # Check if the file is an html file
+                    if extension == "html":
+                        html_content = lecture.get("html_content").encode(
+                            "ascii", "ignore").decode("utf8")
+                        lecture_path = os.path.join(
+                            chapter_dir, "{}.html".format(sanitize(lecture_title)))
+                        try:
+                            with open(lecture_path, 'w') as f:
+                                f.write(html_content)
+                                f.close()
+                        except Exception as e:
+                            print("    > Failed to write html file: ", e)
+                            continue
+                    else:
+                        process_lecture(lecture, lecture_path, lecture_file_name,
+                                        quality, access_token,
+                                        concurrent_connections, chapter_dir)
 
             if dl_assets:
                 assets = lecture.get("assets")
@@ -1521,7 +1529,7 @@ def get_version_string():
                     lecture_counter = 0
                     lectures = []
                     chapter_index = entry.get("object_index")
-                    chapter_title = "{0:02d} ".format(chapter_index) + _clean(
+                    chapter_title = "{0:02d} - ".format(chapter_index) + _clean(
                         entry.get("title"))
 
                     if chapter_title not in _udemy["chapters"]: