-
Notifications
You must be signed in to change notification settings - Fork 1
/
htmltokens.py
105 lines (81 loc) · 2.6 KB
/
htmltokens.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
#-------------------------------------------------------------------------------
# Name: lexhtml
# Purpose: PLY lexer for a small subset of HTML; embedded JavaScript is returned as a single JAVASCRIPT token
#
# Author: TRINITI
#
# Created: 22-07-2014
# Copyright: (c) TRINITI 2014
# Licence: <your licence>
#-------------------------------------------------------------------------------
import ply.lex as lex
#import re
# Token type names declared for PLY.
# NOTE(review): NUMBER is declared but no rule in this file produces it;
# confirm whether a parser elsewhere depends on the name.
tokens = (
    # tag punctuation
    'LANGLE',
    'LANGLESLASH',
    'RANGLE',
    'SLASHRANGLE',
    # attribute assignment
    'EQUAL',
    # text-like tokens
    'STRING',
    'WORD',
    'NUMBER',
    # body of an embedded script
    'JAVASCRIPT',
)

# Additional lexer states, both declared exclusive.
states = (
    ('htmlcomment', 'exclusive'),  # between <!-- and -->
    ('javascript', 'exclusive'),   # between the script open and close tags
)

# Characters skipped between tokens in the INITIAL state (whitespace
# except newline, which has its own rule for line counting).
t_ignore = ' \t\v\r'
# INITIAL-state rule: an opening <script type="text/javascript"> tag switches
# the lexer into the 'javascript' state. Nothing is returned, so the tag
# itself produces no token. The docstring is the rule's regex; keep it as is.
def t_javascript(token):
    r'\<script\ type=\"text\/javascript\"\>'
    lexer = token.lexer
    # Remember the current lexer position so the closing-tag rule can slice
    # the script body out of the input.
    lexer.code_start = lexer.lexpos
    lexer.begin("javascript")
# 'javascript'-state rule: the closing </script> tag ends the script body.
# Returns the token retyped as JAVASCRIPT, with its value set to the input
# text between the recorded code_start and the start of the closing tag,
# and switches the lexer back to INITIAL.
def t_javascript_end(token):
    r'\<\/script\>'
    lexer = token.lexer
    closing_tag_len = 9  # number of characters in '</script>'
    script_body = lexer.lexdata[lexer.code_start:lexer.lexpos - closing_tag_len]
    token.type = 'JAVASCRIPT'
    token.value = script_body
    # This state has no newline rule of its own, so the line counter is
    # advanced here for every newline inside the captured body.
    lexer.lineno += script_body.count('\n')
    lexer.begin('INITIAL')
    return token
# Error hook for the 'javascript' state: advance one character and carry on.
# The skipped text is not lost; t_javascript_end slices it from the input.
def t_javascript_error(token):
    lexer = token.lexer
    lexer.skip(1)
# INITIAL-state rule: '<!--' opens an HTML comment. The lexer enters the
# 'htmlcomment' state and no token is produced for the opener.
def t_htmlcomment(token):
    r'<!--'
    lexer = token.lexer
    lexer.begin('htmlcomment')
# 'htmlcomment'-state rule: count each newline inside a comment so line
# numbers stay correct; no token is produced.
def t_htmlcomment_newline(token):
    r'\n'
    lexer = token.lexer
    lexer.lineno = lexer.lineno + 1
# 'htmlcomment'-state rule: '-->' closes the comment and returns the lexer
# to the INITIAL state. No token is produced.
def t_htmlcomment_end(token):
    r'-->'
    # The matched text is always exactly '-->', so it can never contain a
    # newline. The old `lineno += token.value.count('\n')` therefore added
    # zero every time and has been removed; newlines inside comments are
    # counted by t_htmlcomment_newline.
    token.lexer.begin('INITIAL')
# Error hook for the 'htmlcomment' state: comment text matches no rule, so
# it is discarded one character at a time.
def t_htmlcomment_error(token):
    lexer = token.lexer
    lexer.skip(1)
# INITIAL-state rule: a newline only advances the line counter; returning
# nothing means no token is produced for it.
def t_newline(token):
    r'\n'
    lexer = token.lexer
    lexer.lineno = lexer.lineno + 1
# INITIAL-state rule: discards whatever the pattern below matches (no token
# is returned) while keeping the line counter in step with any newlines in
# the discarded text.
# NOTE(review): the pattern starts with a literal '/<--', not '<!--'. HTML
# comments are already handled by t_htmlcomment and the 'htmlcomment' state,
# so confirm whether this rule is intentional or a leftover.
def t_comment(token):
    r'/<--(.|\n)*?\-->'
    newline_total = token.value.count('\n')
    token.lexer.lineno += newline_total
# Characters skipped between tokens inside each exclusive state.
t_htmlcomment_ignore = ' \t\v\r'
t_javascript_ignore = ' \t\v\r'

# Tag punctuation, given to PLY as regular-expression strings.
t_LANGLESLASH = r'</'
t_LANGLE = r'<'
t_RANGLE = r'>'
# Fixed: the raw-string prefix had been typed inside the quotes ('r/>'),
# so the pattern matched the three characters "r/>" instead of "/>".
# NOTE(review): PLY tries function rules before string rules and t_WORD
# accepts '/', so '/>' may still be lexed as WORD followed by RANGLE.
# Confirm whether SLASHRANGLE is expected to be reachable.
t_SLASHRANGLE = r'/>'
t_EQUAL = r'='
# INITIAL-state rule: a single- or double-quoted string with no embedded
# quote of the same kind. The returned token's value has the surrounding
# quote characters stripped.
def t_STRING(token):
    r'(?:"[^"]*"|\'[^\']*\')'
    quoted = token.value
    token.value = quoted[1:-1]  # drop the opening and closing quote
    return token
# INITIAL-state rule: the longest run of characters that contains none of
# '<', '>', '=' or newline. Spaces and quote characters inside the run are
# part of the match. The token is returned unchanged.
def t_WORD(token):
    r'[^<>\n=]+'
    return token
# Error hook for the INITIAL state: report the first character of the
# unmatched input and skip over it so lexing can continue.
def t_error(t):
    # Parenthesised single-argument print behaves the same on Python 2 and
    # Python 3; the bare print statement is a SyntaxError on Python 3.
    print("HTML Lexer: Illegal character " + t.value[0])
    t.lexer.skip(1)
# Sample document for the demo below: plain words, a tag pair, an HTML
# comment that spans a line break, and an embedded script.
webpage = """this is
<b> my<!---
Nishant Oli---> </b>webpage!<script type="text/javascript"> int + * =
//nishant
goli </script>
hi"""

# Build the lexer from the rules defined in this module and feed it the
# sample text. This runs at import time, as in the original.
htmllexer = lex.lex()
htmllexer.input(webpage)

# Print tokens until token() returns a false value at end of input.
while True:
    tok = htmllexer.token()
    if not tok:
        break
    # Parenthesised single-argument print works on Python 2 and Python 3;
    # the bare print statement is a SyntaxError on Python 3.
    print(tok)