forked from andrewferguson/YahooGroups-Archiver
-
Notifications
You must be signed in to change notification settings - Fork 1
/
archive_group.py
143 lines (123 loc) · 5.61 KB
/
archive_group.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
'''
Yahoo-Groups-Archiver Copyright 2015, 2017, 2018 Andrew Ferguson and others
YahooGroups-Archiver, a simple python script that allows for all
messages in a public Yahoo Group to be archived.
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
'''
cookie_T = 'COOKIE_T_DATA_GOES_HERE'
cookie_Y = 'COOKIE_Y_DATA_GOES_HERE'
import json #required for reading various JSON attributes from the content
import requests #required for fetching the raw messages
import os #required for checking if a file exists locally
import time #required if Yahoo blocks access temporarily (to wait)
import sys #required to cancel script if blocked by Yahoo
import shutil #required for deleting an old folder
import glob #required to find the most recent message downloaded
import time #required to log the date and time of run
def _latest_archived_message(groupName):
    """Return the highest message number already saved in the group's folder.

    Only files named ``<integer>.json`` directly inside ``groupName`` count;
    anything else (stray files, non-numeric names) is ignored. Returns 1 when
    the folder does not exist or holds no numbered message.
    """
    mostRecent = 1
    if not os.path.isdir(groupName):
        return mostRecent
    # List the folder directly instead of chdir-ing into it, so a failure
    # here can never leave the process in the wrong working directory.
    for fileName in os.listdir(groupName):
        stem, ext = os.path.splitext(fileName)
        if ext != ".json":
            continue
        try:
            number = int(stem)
        except ValueError:
            # Not a "<number>.json" message file; skip it rather than crash.
            continue
        if number > mostRecent:
            mostRecent = number
    return mostRecent


def archive_group(groupName, mode="update"):
    """Download every message of a group that is not yet stored on disk.

    Messages are saved as ``<groupName>/<number>.json``. Numbers whose file
    already exists are skipped, whatever the mode. A start and a finish line
    are written through ``log``.

    Args:
        groupName: Name of the Yahoo group. Also used as the archive folder
            name and passed to ``log``.
        mode: Where to start archiving:
            "update"  - at the highest message number already archived,
            "retry"   - at message 1, fetching whatever is missing,
            "restart" - delete the existing archive folder, then start at 1.
            Any other value prints a help text and exits the script.
    """
    log("\nArchiving group '" + groupName + "', mode: " + mode + " , on " + time.strftime("%c"), groupName)
    startTime = time.time()
    msgsArchived = 0

    if mode == "retry":
        # Don't re-download messages we already have, but do try again for
        # ones that are missing (possibly from an earlier failed attempt).
        firstMsg = 1
    elif mode == "update":
        # Resume at the newest message on disk; it is skipped below because
        # its file exists, so archiving effectively continues right after it.
        firstMsg = _latest_archived_message(groupName)
    elif mode == "restart":
        # Delete all previous archival attempts and archive everything again.
        if os.path.exists(groupName):
            shutil.rmtree(groupName)
        firstMsg = 1
    else:
        print("You have specified an invalid mode (" + mode + ").")
        print("Valid modes are:\nupdate - add any new messages to the archive\nretry - attempt to get all messages that are not in the archive\nrestart - delete archive and start from scratch")
        sys.exit()

    if not os.path.exists(groupName):
        os.makedirs(groupName)

    lastMsg = group_messages_max(groupName)
    for msgNumber in range(firstMsg, lastMsg + 1):
        if os.path.isfile(os.path.join(groupName, str(msgNumber) + ".json")):
            continue
        print("Archiving message " + str(msgNumber) + " of " + str(lastMsg))
        if archive_message(groupName, msgNumber):
            msgsArchived += 1

    log("Archive finished, archived " + str(msgsArchived) + ", time taken is " + str(time.time() - startTime) + " seconds", groupName)
def group_messages_max(groupName):
    """Ask the Yahoo Groups API for the upper bound of message numbers.

    Requests a single message in descending order and returns the
    ``ygData.totalRecords`` field of the JSON reply; ``archive_group`` uses
    that value as the last message number to fetch.

    Args:
        groupName: Name of the Yahoo group to query.

    Returns:
        The value of ``pageJson["ygData"]["totalRecords"]``.

    Exits the script (``sys.exit``) when the reply is not JSON: either the
    Yahoo sign-in page came back (private group, cookies required) or the
    reply is something else unexpected.
    """
    url = 'https://groups.yahoo.com/api/v1/groups/' + groupName + '/messages?count=1&sortOrder=desc&direction=-1'
    # The response body is fully read before the session closes, so it is
    # safe to use ``resp`` after the ``with`` block.
    with requests.Session() as session:
        resp = session.get(url, cookies={'T': cookie_T, 'Y': cookie_Y})
    pageHTML = resp.text

    try:
        pageJson = json.loads(pageHTML)
    except ValueError:
        if "Stay signed in" in pageHTML and "Trouble signing in" in pageHTML:
            #the user needs to be signed in to Yahoo
            print("Error. The group you are trying to archive is a private group. To archive a private group using this tool, login to a Yahoo account that has access to the private groups, then extract the data from the cookies Y and T from the domain yahoo.com . Paste this data into the appropriate variables (cookie_Y and cookie_T) at the top of this script, and run the script again.")
            sys.exit()
        # Any other non-JSON reply used to fall through to the return below
        # and die with an UnboundLocalError; report it explicitly instead.
        print("Error. Yahoo returned an unexpected non-JSON response (HTTP status code " + str(resp.status_code) + ") when asked for the message count of group '" + groupName + "'.")
        sys.exit(1)

    return pageJson["ygData"]["totalRecords"]
def archive_message(groupName, msgNumber, depth=0):
    """Fetch one raw message from the Yahoo Groups API and save it to disk.

    A non-200 reply is retried (after a 0.1 s pause) while the attempt
    counter is below 3. The message is written to
    ``<groupName>/<msgNumber>.json`` only from a reply with status 200, so
    an error body can never overwrite a successfully fetched message.

    Args:
        groupName: Name of the Yahoo group; also the folder to write into.
        msgNumber: Number of the message to fetch.
        depth: Number of attempts already used up. Callers normally leave it
            at 0; it is kept for backward compatibility.

    Returns:
        True if the message was downloaded and written, False if the final
        attempt failed with a status other than 500.

    Exits the script (``sys.exit``) if the final attempt returns HTTP 500,
    which the script treats as being blocked by Yahoo.
    """
    # Module-level flag kept for backward compatibility: it mirrors the
    # outcome of the most recent call.
    global failed
    failed = False

    url = 'https://groups.yahoo.com/api/v1/groups/' + groupName + '/messages/' + str(msgNumber) + '/raw'
    attempt = depth
    while True:
        # A fresh session per attempt, as before; the body is fully read
        # before the session is closed.
        with requests.Session() as session:
            resp = session.get(url, cookies={'T': cookie_T, 'Y': cookie_Y})
        if resp.status_code == 200:
            break

        #some other problem, perhaps being refused access by Yahoo?
        #retry for a max of 3 times anyway
        if attempt < 3:
            print("Cannot get message " + str(msgNumber) + ", attempt " + str(attempt+1) + " of 3 due to HTTP status code " + str(resp.status_code))
            time.sleep(0.1)
            attempt += 1
            continue

        if resp.status_code == 500:
            #we are most likely being blocked by Yahoo
            log("Archive halted - it appears Yahoo has blocked you.", groupName)
            log("Check if you can access the group's homepage from your browser. If you can't, you have been blocked.", groupName)
            log("Don't worry, in a few hours (normally less than 3) you'll be unblocked and you can run this script again - it'll continue where you left off." ,groupName)
            sys.exit()
        log("Failed to retrive message " + str(msgNumber) + " due to HTTP status code " + str(resp.status_code), groupName )
        failed = True
        return False

    with open(groupName + "/" + str(msgNumber) + ".json", "wb") as writeFile:
        writeFile.write(resp.text.encode('utf-8'))
    return True
# File logging is controlled by the module-level flag ``writeLogFile``,
# which the script entry point assigns before archiving starts.
def log(msg, groupName):
    """Print a message and, unless logging is disabled, append it to a file.

    Args:
        msg: Text to print and log.
        groupName: Prefix of the log file; the message is appended to
            ``<groupName>.txt``, preceded by a newline.
    """
    print(msg)
    # The flag is only assigned when the file runs as a script. Looking it
    # up with a default keeps log() usable (logging on) when the module is
    # imported instead, where it used to raise NameError.
    if globals().get("writeLogFile", True):
        with open(groupName + ".txt", "a") as logF:
            logF.write("\n" + msg)
if __name__ == "__main__":
    # Command line: <groupName> [mode] [nologs]
    # File logging is on unless the "nologs" argument is present.
    writeLogFile = True
    # Work from the script's own folder so archive folders and log files
    # are created next to it, wherever the script was launched from.
    os.chdir(os.path.dirname(os.path.abspath(__file__)))
    if "nologs" in sys.argv:
        print("Logging mode OFF")
        writeLogFile = False
        sys.argv.remove("nologs")
    if len(sys.argv) < 2:
        # Without a group name the script used to die with an IndexError.
        sys.exit("Usage: python archive_group.py <groupName> [update|retry|restart] [nologs]")
    if len(sys.argv) > 2:
        archive_group(sys.argv[1], sys.argv[2])
    else:
        archive_group(sys.argv[1])