Skip to content

Commit

Permalink
Add douban people support
Browse files Browse the repository at this point in the history
like https://www.douban.com/people/<douban_id>/photos

Signed-off-by: Ein Verne <einverne@gmail.com>
  • Loading branch information
einverne committed Aug 3, 2017
1 parent fe58e98 commit 3d3a739
Show file tree
Hide file tree
Showing 4 changed files with 62 additions and 4 deletions.
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ Usage

https://www.douban.com/photos/album/<album_id>
https://movie.douban.com/celebrity/<celebrity_id>
https://www.douban.com/people/<douban_id>/photos

`path` is the folder where images are saved; it defaults to `./douban`.

Expand Down
18 changes: 14 additions & 4 deletions douban/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from douban.celebrity import Celebrity
from douban.douban_album_dl import get_album
from douban.douban_celebrity_dl import get_celebrity
from douban.people import People


def get_args():
Expand All @@ -29,21 +30,30 @@ def parse_url(url, path):
"""
https://www.douban.com/photos/album/<album_id>
https://movie.douban.com/celebrity/<celebrity_id>
https://www.douban.com/people/<douban_id>/photos
:param url:
:return:
"""
match = re.match(r'(http|https)://www.douban.com/photos/album/(\d+)', url)
match = re.match(r'https?://www.douban.com/photos/album/(\d+)', url)
if match:
album_id = match.group(2)
album_id = match.group(1)
album = Album(album_id)
get_album(album, path)
return
match = re.match(r'(http|https)://movie.douban.com/celebrity/(\d+)', url)
match = re.match(r'https?://movie.douban.com/celebrity/(\d+)', url)
if match:
celebrity_id = match.group(2)
celebrity_id = match.group(1)
celebrity = Celebrity(celebrity_id)
get_celebrity(celebrity, path)
return
match = re.match(r'https?://www.douban.com/people/(\w+)(/|/photos)', url)
if match:
people_id = match.group(1)
people = People(people_id)
for album_id in people.albums():
a = Album(album_id)
get_album(a, path + '/' + album_id)
return
print("Not support this url yet")


Expand Down
2 changes: 2 additions & 0 deletions douban/douban_album_dl.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@

def get_album(album, path):
idx = 0
pwd = os.getcwd()
file_utils.mkdir(path)
os.chdir(path)
for photo_url in album.photos():
Expand All @@ -23,3 +24,4 @@ def get_album(album, path):
file_utils.save_from_url(photo_url, headers, name)
idx += 1
print("saving album to {}, total {} images".format(path, idx))
os.chdir(pwd)
45 changes: 45 additions & 0 deletions douban/people.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
import re
import requests


class People:
    """Enumerate the photo albums listed on one Douban user's photos page.

    The listing is fetched page by page from
    ``https://www.douban.com/people/<user_id>/photos?start=<n>`` and album
    IDs are extracted from the album links found in the returned HTML.
    """

    BASE_URL = 'https://www.douban.com/people/'

    # Seconds to wait for a listing page; without a timeout a stalled
    # connection would block the whole download forever.
    TIMEOUT = 30

    # Matches absolute album links and captures the numeric album ID.
    # The dots are escaped so only the real host name matches.
    _ALBUM_LINK = re.compile(r'https?://www\.douban\.com/photos/album/(\d+)')

    def __init__(self, user_id):
        """Remember the user ID and build the paged listing URL prefix.

        :param user_id: the ``<douban_id>`` part of the user's profile URL.
        """
        self.uid = user_id
        self.url = People.BASE_URL + '{}/photos?start='.format(user_id)

    def albums(self):
        """Yield every album ID of the user exactly once, in listing order.

        Pages are requested lazily, one at a time, while iterating.

        :return: generator of album ID strings.
        """
        yield from self._unique_across_pages(self.__album)

    @staticmethod
    def _unique_across_pages(fetch_page):
        """Walk paged results and yield each ID the first time it appears.

        :param fetch_page: callable taking the ``start`` value and returning
            the album IDs found on that page.
        :return: generator of IDs without repetitions.
        """
        seen = set()
        start = 0
        while True:
            page_ids = fetch_page(start)
            fresh = []
            for album_id in page_ids:
                if album_id not in seen:
                    seen.add(album_id)
                    fresh.append(album_id)
            # Stop on an empty page, and also on a page that only repeats
            # known albums: if the site keeps serving the same links for
            # out-of-range ``start`` values the loop would otherwise never
            # end.
            if not fresh:
                break
            yield from fresh
            # ``start`` advances by the number of albums found on the page
            # (presumably it is an offset into the listing -- TODO confirm).
            start += len(page_ids)

    @classmethod
    def _extract_album_ids(cls, html):
        """Return the album IDs linked from ``html``.

        Duplicates are dropped and first-seen order is kept, so the result is
        deterministic for a given page.

        :param html: text of a listing page.
        :return: list of album ID strings.
        """
        return list(dict.fromkeys(cls._ALBUM_LINK.findall(html)))

    def __album(self, start):
        """Download one listing page and return the album IDs it links to.

        :param start: value sent as the ``start`` query parameter.
        :return: list of unique album ID strings found on the page.
        """
        headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            # 'br' is deliberately not advertised: requests only decodes
            # Brotli when an optional brotli package is installed, and an
            # undecoded body would make the link search silently find nothing.
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.6,en;q=0.4',
            'Connection': 'keep-alive',
            'DNT': '1',
            'HOST': 'www.douban.com',
            'Referer': People.BASE_URL + self.uid,
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'
        }
        url = self.url + str(start)
        r = requests.get(url, headers=headers, timeout=self.TIMEOUT)
        return self._extract_album_ids(r.text)


if __name__ == '__main__':
    # Manual smoke run: print every album ID found for a sample user.
    for found_id in People('LordBean').albums():
        print(found_id)

0 comments on commit 3d3a739

Please sign in to comment.