Skip to content

Commit

Permalink
Update we heart it
Browse files Browse the repository at this point in the history
  • Loading branch information
einverne committed Jan 8, 2019
1 parent 6d5fb96 commit 63106e3
Show file tree
Hide file tree
Showing 4 changed files with 68 additions and 19 deletions.
10 changes: 10 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,10 @@ Introduction
4. 电影剧照

douban-dl https://movie.douban.com/subject/26804147
5. We Heart It

douban-dl https://weheartit.com/ma_yu_scandal/collections/106301429-

Installation
------------
Expand All @@ -50,6 +54,12 @@ Usage

如果没有指定 `path` ,默认会保存到当前目录下 douban 文件夹中。

更新日志
-------

2019-01-08 增加 We Heart It 支持


License
-------

Expand Down
14 changes: 10 additions & 4 deletions douban/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from douban.celebrity import get_celebrity_by_id
from douban.movie import get_movie_by_id
from douban.people import People
from douban.weheartit import get_collection_photos


def get_args():
def parse_url(url, path):
    """
    Dispatch *url* to the downloader that matches it.

    Supported URLs: douban photo albums, movie celebrities, people pages,
    movie subjects, and We Heart It collections. Prints a notice when no
    pattern matches.

    :param url: the URL to download from
    :param path: destination directory for the downloads
    :return: None
    """
    # Dots are escaped so '.' cannot match arbitrary characters in the host.
    match = re.match(r'^https?://www\.douban\.com/photos/album/(\d+)$', url)
    if match:
        get_album_by_id(match.group(1), path)
        return
    match = re.match(r'^https?://movie\.douban\.com/celebrity/(\d+)$', url)
    if match:
        get_celebrity_by_id(match.group(1), path)
        return
    match = re.match(r'^https?://www\.douban\.com/people/(\w+)(/|/photos)$', url)
    if match:
        people_id = match.group(1)
        people = People(people_id)
        # One sub-directory per album, named by album id.
        for album_id in people.albums():
            get_album_by_id(album_id, os.path.join(path, album_id))
        return
    match = re.match(r'^https?://movie\.douban\.com/subject/(\d+)$', url)
    if match:
        get_movie_by_id(match.group(1), path)
        return
    # [A-Z] replaces the original 'A-z' range, which also matched '[\]^_`'.
    match = re.match(r'^https?://weheartit\.com/([a-zA-Z0-9_\-]+)/collections/([0-9a-zA-Z\-]+)$', url)
    if match:
        get_collection_photos(match.group(1), match.group(2), path)
        # Bug fix: without this return, a successful We Heart It match
        # still fell through to the "not supported" message below.
        return
    print("Not support this url yet")


Expand Down
55 changes: 40 additions & 15 deletions douban/weheartit.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,13 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os

import requests
from bs4 import BeautifulSoup

from douban import threadPoolExecutor
from utils import number1, file_utils

headers = {
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.62 Safari/537.36'}

Expand All @@ -16,33 +20,54 @@ def __init__(self, user_id, collection_id) -> None:

def photos(self):
    """Yield every photo URL in the collection, page by page.

    Walks the site's infinite-scroll pagination: each request asks for the
    next page anchored on the numeric id of the last photo already seen
    (the 'before' parameter). Stops when a page comes back empty.

    :return: generator of photo URL strings
    """
    page = 1
    before = ''
    while True:
        photos = self.__photos(page, before)
        # Bug fix: check for an empty page BEFORE touching photos[-1];
        # the original indexed first and raised IndexError on the
        # (inevitable) empty final page.
        if not photos:
            break
        # Anchor the next request on the last photo's numeric id.
        before = number1(photos[-1])
        for photo in photos:
            yield photo
        page += 1

def __photos(self, page, before):
    """Fetch one infinite-scroll chunk and return its photo URLs as a list.

    :param page: 1-based page number of the pagination
    :param before: id anchor for the pagination ('' for the first request)
    :return: list of full-size photo URL strings (may be empty)
    """
    response = requests.get(
        self.url,
        params={
            'scrolling': 'true',
            'page': page,
            'before': before
        },
        headers={
            'Host': 'weheartit.com',
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
        })
    soup = BeautifulSoup(response.content, 'html.parser')
    urls = []
    for thumbnail in soup.find_all('img', class_='entry-thumbnail'):
        src = thumbnail['src']
        # Thumbnails point at the 'superthumb' rendition; swap in the
        # original, and drop any query string.
        src = src.replace('superthumb', 'original')
        urls.append(src.split('?', 1)[0])
    return urls


def get_page():
"""
https://weheartit.com/ma_yu_scandal/collections/106301429-?scrolling=true&page=2&before=298859538
"""
url = 'https://weheartit.com/ma_yu_scandal/collections/106301429-'
r = requests.get(url, headers={
'Host': 'weheartit.com',
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
})
def get_collection_photos(username, collection_id, path):
    """Download every photo of a We Heart It collection into *path*.

    Downloads are handed off to the shared thread pool, so this returns
    once all photos have been submitted, not once they are written.

    :param username: collection owner, e.g. 'ma_yu_scandal'
    :param collection_id: collection slug, e.g. '106301429-'
    :param path: destination directory; created (with parents) if missing
    :return: None
    """
    heart = WeHeartIt(username, collection_id)
    # makedirs (vs the original mkdir) also creates missing parent
    # directories and is race-free via exist_ok.
    os.makedirs(path, exist_ok=True)
    for idx, photo_url in enumerate(heart.photos()):
        # File name: '<numeric photo id>.<extension from the URL>'.
        photo_name = number1(photo_url) + '.' + photo_url.split('.')[-1]
        full_path = os.path.join(path, photo_name)
        threadPoolExecutor.submit(
            file_utils.save_from_url,
            url=photo_url,
            headers={
                'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
            },
            name=full_path,
            index=idx)


if __name__ == '__main__':
    # Ad-hoc manual test. get_collection_photos returns None, so binding
    # its result to a variable (as before) served no purpose.
    get_collection_photos('ma_yu_scandal', '106301429-', '/home/einverne/Pictures/weheartid')
8 changes: 8 additions & 0 deletions utils/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,10 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import re


def number1(s):
    """Return the first run of decimal digits in *s*, or None if there is none.

    e.g. number1('images/298859538/original.jpg') -> '298859538'

    :param s: string to scan
    :return: the first digit run as a string, or None
    """
    # Raw string: '\d' in a plain literal is an invalid escape sequence
    # (DeprecationWarning on modern Python).
    m = re.search(r'\d+', s)
    # Explicit None instead of the original implicit fall-through.
    return m.group(0) if m else None

0 comments on commit 63106e3

Please sign in to comment.