-
-
Notifications
You must be signed in to change notification settings - Fork 5
/
pdf.go
173 lines (141 loc) · 4.08 KB
/
pdf.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
// pdf/pdf.go
package pdf
import (
"fmt"
"io"
"net/http"
"os"
"path/filepath"
"strings"
"github.com/ledongthuc/pdf"
)
// ConvertPDFToMarkdown converts a PDF file to markdown format
func ConvertPDFToMarkdown(path string, isURL bool) (string, error) {
var reader io.ReadCloser
var err error
if isURL {
reader, err = downloadPDF(path)
if err != nil {
return "", fmt.Errorf("failed to download PDF: %w", err)
}
defer reader.Close()
tempFile, err := os.CreateTemp("", "ingest-*.pdf")
if err != nil {
return "", fmt.Errorf("failed to create temp file: %w", err)
}
defer os.Remove(tempFile.Name())
defer tempFile.Close()
if _, err := io.Copy(tempFile, reader); err != nil {
return "", fmt.Errorf("failed to save PDF: %w", err)
}
path = tempFile.Name()
}
// Open and read the PDF
f, r, err := pdf.Open(path)
if err != nil {
return "", fmt.Errorf("failed to open PDF: %w", err)
}
defer f.Close()
var buf strings.Builder
buf.WriteString(fmt.Sprintf("# PDF Content: %s\n\n", filepath.Base(path)))
// Extract text from each page
totalPages := r.NumPage()
for pageNum := 1; pageNum <= totalPages; pageNum++ {
page := r.Page(pageNum)
if page.V.IsNull() {
continue
}
text, err := page.GetPlainText(nil)
if err != nil {
return "", fmt.Errorf("failed to extract text from page %d: %w", pageNum, err)
}
// Clean and process the text
cleanedText := cleanText(text)
if cleanedText != "" {
buf.WriteString(fmt.Sprintf("## Page %d\n\n", pageNum))
buf.WriteString(cleanedText)
buf.WriteString("\n\n")
}
}
result := buf.String()
if strings.TrimSpace(result) == strings.TrimSpace(fmt.Sprintf("# PDF Content: %s\n\n", filepath.Base(path))) {
return "", fmt.Errorf("no text content could be extracted from PDF")
}
return result, nil
}
// IsPDF checks if a file is a PDF based on its content type or extension
func IsPDF(path string) (bool, error) {
// Check if it's a URL
if strings.HasPrefix(path, "http://") || strings.HasPrefix(path, "https://") {
resp, err := http.Head(path)
if err != nil {
return false, fmt.Errorf("failed to check URL for PDF: %w", err)
}
defer resp.Body.Close()
return resp.Header.Get("Content-Type") == "application/pdf", nil
}
// Check local file
file, err := os.Open(path)
if err != nil {
return false, fmt.Errorf("failed to open file: %w", err)
}
defer file.Close()
// Read first 512 bytes to determine file type
buffer := make([]byte, 512)
n, err := file.Read(buffer)
if err != nil && err != io.EOF {
return false, fmt.Errorf("failed to read file header: %w", err)
}
// Check file signature
contentType := http.DetectContentType(buffer[:n])
if contentType == "application/pdf" {
return true, nil
}
// Also check file extension
return strings.ToLower(filepath.Ext(path)) == ".pdf", nil
}
func downloadPDF(url string) (io.ReadCloser, error) {
resp, err := http.Get(url)
if err != nil {
return nil, err
}
if resp.StatusCode != http.StatusOK {
resp.Body.Close()
return nil, fmt.Errorf("failed to download PDF: status code %d", resp.StatusCode)
}
if resp.Header.Get("Content-Type") != "application/pdf" {
resp.Body.Close()
return nil, fmt.Errorf("URL does not point to a PDF file")
}
return resp.Body, nil
}
func cleanText(text string) string {
if strings.Contains(text, "%PDF-") || strings.Contains(text, "endobj") {
// This appears to be raw PDF data rather than extracted text
return ""
}
// Remove control characters except newlines and tabs
text = strings.Map(func(r rune) rune {
if r < 32 && r != '\n' && r != '\t' {
return -1
}
return r
}, text)
// Split into lines and clean each line
lines := strings.Split(text, "\n")
var cleanLines []string
for _, line := range lines {
line = strings.TrimSpace(line)
// Skip empty lines and lines that look like PDF syntax
if line == "" ||
strings.HasPrefix(line, "%") ||
strings.HasPrefix(line, "/") ||
strings.Contains(line, "obj") ||
strings.Contains(line, "endobj") ||
strings.Contains(line, "stream") {
continue
}
cleanLines = append(cleanLines, line)
}
return strings.Join(cleanLines, "\n\n")
}