forked from andrewferguson/YahooGroups-Archiver
-
Notifications
You must be signed in to change notification settings - Fork 1
/
make_Yearly_Text_Archive_html.py
executable file
·126 lines (108 loc) · 4.77 KB
/
make_Yearly_Text_Archive_html.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
#!/usr/local/bin/python
'''
Yahoo-Groups-Archiver, HTML Archive Script Copyright 2019 Robert Lancaster and others
YahooGroups-Archiver, a simple python script that allows for all
messages in a public Yahoo Group to be archived.
The HTML Archive Script allows you to take the downloaded json documents
and turn them into html-based yearly archives of emails.
Note that the archive-group.py script must be run first.
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
'''
import email
import HTMLParser
import json
import os
import sys
from datetime import datetime
from natsort import natsorted, ns
# To avoid Unicode issues: make UTF-8 the interpreter-wide default encoding so
# the str/unicode conversions performed further down use UTF-8.
# NOTE(review): this is a Python 2-only idiom (reload() is a builtin there and
# sys.setdefaultencoding is presumably only reachable after reloading sys --
# confirm against the Python 2 `site` documentation). It must run before any
# message is processed.
reload(sys)
sys.setdefaultencoding('utf-8')
def archiveYahooMessage(file, archiveFile, messageYear, format):
    '''Append one rendered Yahoo message to a yearly HTML archive file.

    file        -- path of the downloaded JSON message document
    archiveFile -- path of the HTML archive that the message is appended to
    messageYear -- year shown in the progress message (not used for the path)
    format      -- codec name forwarded unchanged to loadYahooMessage

    Returns None. Any failure is reported on stdout instead of being raised,
    so a single bad message does not stop the caller's loop.
    '''
    try:
        # Render before opening the archive: a message that fails to load
        # then neither creates an empty archive file nor leaves a handle open.
        messageHtml = loadYahooMessage(file, format)
        # 'with' closes the archive even when write() raises.
        with open(archiveFile, 'a') as f:
            f.write(messageHtml)
        print('Yahoo Message: ' + file + ' archived to: archive-' + str(messageYear) + '.html')
    except Exception as e:
        # Deliberate best-effort: report and carry on with the next message.
        print('Yahoo Message: ' + file + ' had an error:')
        print(e)
def loadYahooMessage(file, format):
    '''Render one downloaded Yahoo message as an HTML fragment.

    file   -- path of a JSON document whose 'ygData' object provides the
              'msgId', 'from', 'postDate', 'subject' and 'rawEmail' fields
    format -- codec name passed to .decode() on each HTML-unescaped text
              field before it is re-encoded as UTF-8

    Returns the fragment as a string: a separator line, the post id, sender,
    post date/time and subject, followed by the body from getEmailBody().
    Errors from reading, JSON parsing or missing keys propagate to the caller.
    '''
    # 'with' guarantees the handle is closed even when reading/parsing fails.
    with open(file, 'r') as f1:
        jsonDoc = json.load(f1)
    ygData = jsonDoc['ygData']

    unescape = HTMLParser.HTMLParser().unescape

    def toUtf8(value):
        # Undo the HTML entity escaping in the stored field, then normalise
        # the result to a UTF-8 encoded string.
        return unescape(value).decode(format).encode('utf-8')

    emailMessageID = ygData['msgId']
    emailMessageSender = toUtf8(ygData['from'])
    emailMessageTimeStamp = ygData['postDate']
    # fromtimestamp() renders the time in the machine's local time zone.
    emailMessageDateTime = datetime.fromtimestamp(float(emailMessageTimeStamp)).strftime('%Y-%m-%d %H:%M:%S')
    emailMessageSubject = toUtf8(ygData['subject'])
    emailMessageString = toUtf8(ygData['rawEmail'])

    message = email.message_from_string(emailMessageString)
    messageBody = getEmailBody(message)

    messageText = '-----------------------------------------------------------------------------------<br>'
    messageText += 'Post ID:' + str(emailMessageID) + '<br>'
    messageText += 'Sender:' + emailMessageSender + '<br>'
    messageText += 'Post Date/Time:' + emailMessageDateTime + '<br>'
    messageText += 'Subject:' + emailMessageSubject + '<br>'
    messageText += 'Message:' + '<br><br>'
    messageText += messageBody
    messageText += '<br><br><br><br><br>'
    return messageText
def getYahooMessageYear(file):
    '''Return the calendar year in which a downloaded Yahoo message was posted.

    file -- path of a JSON message document providing ygData.postDate

    postDate is converted with float() and datetime.fromtimestamp(), so the
    year is reported in the machine's local time zone. Errors from reading,
    JSON parsing or a missing key propagate to the caller.
    '''
    # 'with' guarantees the handle is closed even when reading/parsing fails.
    with open(file, 'r') as f1:
        jsonDoc = json.load(f1)
    emailMessageTimeStamp = jsonDoc['ygData']['postDate']
    return datetime.fromtimestamp(float(emailMessageTimeStamp)).year
# Thank you to the help in this forum for the bulk of this function
# https://stackoverflow.com/questions/17874360/python-how-to-parse-the-body-from-a-raw-email-given-that-raw-email-does-not
def _payloadAsText(part):
    '''Return the transfer-decoded payload of an email part as text.'''
    payload = part.get_payload(decode=True)
    if payload is None:
        # get_payload(decode=True) yields None when there is nothing to decode.
        return ''
    if isinstance(payload, bytes) and not isinstance(payload, str):
        # Only reached on interpreters where bytes and str are distinct types;
        # where they are the same type the payload is returned untouched.
        charset = part.get_content_charset() or 'utf-8'
        try:
            payload = payload.decode(charset, 'replace')
        except LookupError:
            # Unknown charset label in the message headers.
            payload = payload.decode('utf-8', 'replace')
    return payload


def getEmailBody(message):
    '''Extract the displayable body of a parsed email as an HTML fragment.

    message -- an email message object (as returned by
               email.message_from_string)

    For a multipart message the first text/plain part that is not marked as
    an attachment is returned wrapped in <pre>...</pre>; if there is no such
    part the result is an empty string. For a single-part message the payload
    is returned as-is when its type is text/html, otherwise wrapped in
    <pre>...</pre>.
    '''
    if message.is_multipart():
        for part in message.walk():
            ctype = part.get_content_type()
            cdispo = str(part.get('Content-Disposition'))
            # skip any text/plain (txt) attachments
            if ctype == 'text/plain' and 'attachment' not in cdispo:
                return '<pre>' + _payloadAsText(part) + '</pre>'
        return ''

    # not multipart - i.e. plain text, no attachments, keeping fingers crossed
    text = _payloadAsText(message)
    if message.get_content_type() == 'text/html':
        # HTML payloads are emitted verbatim so they render as markup.
        return text
    return '<pre>' + text + '</pre>'
## This is where the script starts
# Usage: pass the group name as the first argument. The directory of that
# name (the downloaded JSON messages) is read, and one archive-<year>.html
# file per posting year is written into "<group name>-archive".
if len(sys.argv) < 2:
    sys.exit('You need to specify your group name')

groupName = sys.argv[1]
oldDir = os.getcwd()
if not os.path.exists(groupName):
    sys.exit('Please run archive-group.py first')

# Resolve the archive directory to an absolute path before changing directory.
archiveDir = os.path.abspath(groupName + '-archive')
if not os.path.exists(archiveDir):
    os.makedirs(archiveDir)

os.chdir(groupName)
try:
    # natsorted keeps numerically named message files in numeric order.
    for file in natsorted(os.listdir(os.getcwd())):
        try:
            messageYear = getYahooMessageYear(file)
        except Exception as e:
            # A stray or unreadable file must not abort the whole run;
            # report it the same way archiveYahooMessage does and move on.
            print('Yahoo Message: ' + file + ' had an error:')
            print(e)
            continue
        archiveFile = os.path.join(archiveDir, 'archive-' + str(messageYear) + '.html')
        archiveYahooMessage(file, archiveFile, messageYear, 'utf-8')
finally:
    # Always return to the starting directory, even if the loop fails.
    os.chdir(oldDir)
print('Complete')