Skip to content

Commit

Permalink
Update we heart it
Browse files Browse the repository at this point in the history
  • Loading branch information
einverne committed Jan 8, 2019
1 parent 6d5fb96 commit 63106e3
Show file tree
Hide file tree
Showing 4 changed files with 68 additions and 19 deletions.
10 changes: 10 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,10 @@ Introduction
4. 电影剧照

douban-dl https://movie.douban.com/subject/26804147
5. We Heart It

douban-dl https://weheartit.com/ma_yu_scandal/collections/106301429-

Installation
------------
Expand All @@ -50,6 +54,12 @@ Usage

如果没有指定 `path` ,默认会保存到当前目录下 douban 文件夹中。

更新日志
-------

2019-01-08 增加 We Heart It 支持


License
-------

Expand Down
14 changes: 10 additions & 4 deletions douban/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from douban.celebrity import get_celebrity_by_id
from douban.movie import get_movie_by_id
from douban.people import People
from douban.weheartit import get_collection_photos


def get_args():
def parse_url(url, path):
    """
    Dispatch *url* to the downloader that matches it.

    Supported URLs: douban photo albums, movie celebrities, people pages,
    movie subjects, and We Heart It collections. Prints a notice when no
    pattern matches.

    :param url: the URL to download from
    :param path: destination directory for the downloads
    :return: None
    """
    # Dots are escaped so '.' cannot match arbitrary characters in the host.
    match = re.match(r'^https?://www\.douban\.com/photos/album/(\d+)$', url)
    if match:
        get_album_by_id(match.group(1), path)
        return
    match = re.match(r'^https?://movie\.douban\.com/celebrity/(\d+)$', url)
    if match:
        get_celebrity_by_id(match.group(1), path)
        return
    match = re.match(r'^https?://www\.douban\.com/people/(\w+)(/|/photos)$', url)
    if match:
        people_id = match.group(1)
        people = People(people_id)
        # One sub-directory per album, named by album id.
        for album_id in people.albums():
            get_album_by_id(album_id, os.path.join(path, album_id))
        return
    match = re.match(r'^https?://movie\.douban\.com/subject/(\d+)$', url)
    if match:
        get_movie_by_id(match.group(1), path)
        return
    # [A-Z] replaces the original 'A-z' range, which also matched '[\]^_`'.
    match = re.match(r'^https?://weheartit\.com/([a-zA-Z0-9_\-]+)/collections/([0-9a-zA-Z\-]+)$', url)
    if match:
        get_collection_photos(match.group(1), match.group(2), path)
        # Bug fix: without this return, a successful We Heart It match
        # still fell through to the "not supported" message below.
        return
    print("Not support this url yet")


Expand Down
55 changes: 40 additions & 15 deletions douban/weheartit.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,13 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os

import requests
from bs4 import BeautifulSoup

from douban import threadPoolExecutor
from utils import number1, file_utils

headers = {
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.62 Safari/537.36'}

Expand All @@ -16,33 +20,54 @@ def __init__(self, user_id, collection_id) -> None:

def photos(self):
    """Yield every photo URL in the collection, page by page.

    Walks the site's infinite-scroll pagination: each request asks for the
    next page anchored on the numeric id of the last photo already seen
    (the 'before' parameter). Stops when a page comes back empty.

    :return: generator of photo URL strings
    """
    page = 1
    before = ''
    while True:
        photos = self.__photos(page, before)
        # Bug fix: check for an empty page BEFORE touching photos[-1];
        # the original indexed first and raised IndexError on the
        # (inevitable) empty final page.
        if not photos:
            break
        # Anchor the next request on the last photo's numeric id.
        before = number1(photos[-1])
        for photo in photos:
            yield photo
        page += 1

def __photos(self, page, before):
    """Fetch one infinite-scroll chunk and return its photo URLs as a list.

    :param page: 1-based page number of the pagination
    :param before: id anchor for the pagination ('' for the first request)
    :return: list of full-size photo URL strings (may be empty)
    """
    response = requests.get(
        self.url,
        params={
            'scrolling': 'true',
            'page': page,
            'before': before
        },
        headers={
            'Host': 'weheartit.com',
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
        })
    soup = BeautifulSoup(response.content, 'html.parser')
    urls = []
    for thumbnail in soup.find_all('img', class_='entry-thumbnail'):
        src = thumbnail['src']
        # Thumbnails point at the 'superthumb' rendition; swap in the
        # original, and drop any query string.
        src = src.replace('superthumb', 'original')
        urls.append(src.split('?', 1)[0])
    return urls


def get_page():
"""
https://weheartit.com/ma_yu_scandal/collections/106301429-?scrolling=true&page=2&before=298859538
"""
url = 'https://weheartit.com/ma_yu_scandal/collections/106301429-'
r = requests.get(url, headers={
'Host': 'weheartit.com',
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
})
def get_collection_photos(username, collection_id, path):
    """Download every photo of a We Heart It collection into *path*.

    Downloads are handed off to the shared thread pool, so this returns
    once all photos have been submitted, not once they are written.

    :param username: collection owner, e.g. 'ma_yu_scandal'
    :param collection_id: collection slug, e.g. '106301429-'
    :param path: destination directory; created (with parents) if missing
    :return: None
    """
    heart = WeHeartIt(username, collection_id)
    # makedirs (vs the original mkdir) also creates missing parent
    # directories and is race-free via exist_ok.
    os.makedirs(path, exist_ok=True)
    for idx, photo_url in enumerate(heart.photos()):
        # File name: '<numeric photo id>.<extension from the URL>'.
        photo_name = number1(photo_url) + '.' + photo_url.split('.')[-1]
        full_path = os.path.join(path, photo_name)
        threadPoolExecutor.submit(
            file_utils.save_from_url,
            url=photo_url,
            headers={
                'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
            },
            name=full_path,
            index=idx)


if __name__ == '__main__':
    # Ad-hoc manual test. get_collection_photos returns None, so binding
    # its result to a variable (as before) served no purpose.
    get_collection_photos('ma_yu_scandal', '106301429-', '/home/einverne/Pictures/weheartid')
8 changes: 8 additions & 0 deletions utils/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,10 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import re


def number1(s):
    """Return the first run of decimal digits in *s*, or None if there is none.

    e.g. number1('images/298859538/original.jpg') -> '298859538'

    :param s: string to scan
    :return: the first digit run as a string, or None
    """
    # Raw string: '\d' in a plain literal is an invalid escape sequence
    # (DeprecationWarning on modern Python).
    m = re.search(r'\d+', s)
    # Explicit None instead of the original implicit fall-through.
    return m.group(0) if m else None

0 comments on commit 63106e3

Please sign in to comment.