Browse Source

Tearpages ok

Phyks (Lucas Verney) 3 years ago
parent
commit
69e9414742
3 changed files with 96 additions and 26 deletions
  1. 0
    7
      README.md
  2. 95
    18
      libbmc/papers/tearpages.py
  3. 1
    1
      requirements.txt

+ 0
- 7
README.md View File

@@ -1,13 +1,6 @@
1 1
 libBMC
2 2
 ======
3 3
 
4
-This is a **WIP**.
5
-
6
-## TODO
7
-
8
-* Generate documentation
9
-
10
-
11 4
 ## Presentation
12 5
 
13 6
 A generic Python library to manage bibliography and play with scientific

+ 95
- 18
libbmc/papers/tearpages.py View File

@@ -4,44 +4,121 @@ the first page from a PDF file, and actually tear it.
4 4
 
5 5
 TODO: Unittests
6 6
 """
7
-import tearpages
7
+import shutil
8
+import tempfile
9
+from PyPDF2 import PdfFileWriter, PdfFileReader
10
+from PyPDF2.utils import PdfReadError
8 11
 
9 12
 
10
-# List of bad publishers which adds an extra useless first page, which can be
13
+# Dict of bad publishers which add an extra useless first page, which can be
11 14
 # teared. Please, submit a PR to include new ones which I may not be aware of!
12
-BAD_PUBLISHERS = [
13
-    "IOP"
14
-]
15
+# This dict associates the publisher string to look for with the list of pages
16
+# to tear.
17
+BAD_PUBLISHERS = {
18
+    "IOP": [0]
19
+}
20
+
21
+
22
+def fixPdf(pdfFile, destination):
23
+    """
24
+    Fix malformed pdf files when data are present after '%%EOF'
25
+
26
+    .. note::
27
+
28
+        Originally from sciunto, https://github.com/sciunto/tear-pages
29
+
30
+    :param pdfFile: PDF filepath
31
+    :param destination: destination
32
+    """
33
+    tmp = tempfile.NamedTemporaryFile()
34
+    output = open(tmp.name, 'wb')
35
+    with open(pdfFile, "rb") as fh:
36
+        with open(pdfFile, "rb") as fh:
37
+            for line in fh:
38
+                output.write(line)
39
+                if b'%%EOF' in line:
40
+                    break
41
+    output.close()
42
+    shutil.copy(tmp.name, destination)
43
+
44
+
45
+def tearpage_backend(filename, teared_pages=[0]):
46
+    """
47
+    Copy filename to a tempfile, write pages to filename except the teared one.
48
+
49
+    .. note::
50
+
51
+        Adapted from sciunto's code, https://github.com/sciunto/tear-pages
52
+
53
+    :param filename: PDF filepath
54
+    :param teared_pages: Numbers of the pages to tear. Defaults to first page \
55
+            only.
56
+    """
57
+    # Copy the pdf to a tmp file
58
+    tmp = tempfile.NamedTemporaryFile()
59
+    shutil.copy(filename, tmp.name)
60
+
61
+    # Read the copied pdf
62
+    try:
63
+        input_file = PdfFileReader(open(tmp.name, 'rb'))
64
+    except PdfReadError:
65
+        fixPdf(filename, tmp.name)
66
+        input_file = PdfFileReader(open(tmp.name, 'rb'))
67
+    # Seek for the number of pages
68
+    num_pages = input_file.getNumPages()
69
+
70
+    # Write all pages except the ones to tear
71
+    output_file = PdfFileWriter()
72
+    for i in range(num_pages):
73
+        if i in teared_pages:
74
+            continue
75
+        output_file.addPage(input_file.getPage(i))
76
+
77
+    tmp.close()
78
+    outputStream = open(filename, "wb")
79
+    output_file.write(outputStream)
15 80
 
16 81
 
17 82
 def tearpage_needed(bibtex):
18 83
     """
19
-    Check whether a given paper needs the first page to be teared or not.
84
+    Check whether a given paper needs some pages to be teared or not.
20 85
 
21 86
     :params bibtex: The bibtex entry associated to the paper, to guess \
22 87
             whether tearing is needed.
23
-    :returns: A boolean indicating whether first page should be teared or not.
88
+    :returns: A list of pages to tear.
24 89
     """
25
-    # For each bad publisher, look for it in the publisher bibtex entry
26
-    has_bad_publisher = [p in bibtex.get("publisher", [])
27
-                         for p in BAD_PUBLISHERS]
28
-    # Return True iff there is at least one bad publisher
29
-    return (True in has_bad_publisher)
90
+    for p in BAD_PUBLISHERS:
91
+        if p in bibtex.get("publisher", ""):
92
+            # Bad publisher is found, add pages to tear
93
+            return BAD_PUBLISHERS[p]
94
+
95
+    # If no bad publishers are found, return an empty list
96
+    return []
30 97
 
31 98
 
32
-def tearpage(filename, bibtex=None):
99
+def tearpage(filename, bibtex=None, force=False):
33 100
     """
34
-    Tear the first page of the file if needed.
101
+    Tear some pages of the file if needed.
35 102
 
36 103
     :params filename: Path to the file to handle.
37 104
     :params bibtex: BibTeX dict associated to this file, as the one given by \
38
-            ``bibtexparser``.
105
+            ``bibtexparser``. (Mandatory if force is not specified)
106
+    :params force: If a list of integers, force the tearing of the \
107
+            specified pages. (Optional)
39 108
     :returns: A boolean indicating whether the file has been teared or not. \
40
-            Side effect is tearing the first page from the file.
109
+            Side effect is tearing the necessary pages from the file.
41 110
     """
42
-    if bibtex is not None and tearpage_needed(bibtex):
111
+    # Fetch pages to tear
112
+    pages_to_tear = []
113
+    if force is not False:
114
+        pages_to_tear = force
115
+    elif bibtex is not None:
116
+        pages_to_tear = tearpage_needed(bibtex)
117
+
118
+    if len(pages_to_tear) > 0:
43 119
         # If tearing is needed, do it and return True
44
-        tearpages.tearpage(filename)
120
+        tearpage_backend(filename, teared_pages=pages_to_tear)
45 121
         return True
122
+
46 123
     # Else, simply return False
47 124
     return False

+ 1
- 1
requirements.txt View File

@@ -3,4 +3,4 @@ bibtexparser>=0.6.2
3 3
 isbnlib>=3.5.7
4 4
 requests>=2.9.1
5 5
 PySocks>=1.5.6
6
-#tear-pages>=0.2.2
6
+PyPDF2>=1.25.1