Browse Source

Initial commit

Phyks (Lucas Verney) 3 years ago
commit
73a33a49aa
3 changed files with 162 additions and 0 deletions
  1. 2
    0
      .gitignore
  2. 43
    0
      README.md
  3. 117
    0
      main.c

+ 2
- 0
.gitignore View File

@@ -0,0 +1,2 @@
1
+latex2plain
2
+a.out*

+ 43
- 0
README.md View File

@@ -0,0 +1,43 @@
1
+LaTeX2Plain
2
+===========
3
+
4
+This is kind of a "daemon" mode for Pandoc, enabling one to convert single
5
+lines of LaTeX to plaintext output using Pandoc with a single long-running
6
+process.
7
+
8
+[Dissemin](https://github.com/dissemin/dissemin/) project was looking for a
9
+way to batch-convert LaTeX titles to plaintext, efficiently (as it was
10
+supposed to be running on a few million lines). The only reliable solution
11
+for such a conversion out there is Pandoc, but Pandoc does not work in a
12
+"daemon" mode, thus requiring one process to be spawned per input line. This
13
+was way too much (and due to the Dissemin pipeline, we could not do a single
14
+batch conversion on many lines).
15
+
16
+This is an attempt at solving this issue. It exposes a long-running process
17
+which you can communicate with _via_ sockets.
18
+
19
+Once the Haskell stack has been loaded and initialized, it takes around 1ms to
20
+process a typical line (scientific article title).
21
+
22
+
23
+## Installation
24
+
25
+To build and use this code, you will need to build and install
26
+[`libpandoc`](https://github.com/ShabbyX/libpandoc). You also need an up to
27
+date install of Pandoc.
28
+
29
+You can then clone this repo and build the main code:
30
+```
31
+gcc main.c -lpandoc -o latex2plain
32
+```
33
+
34
+
35
+## Usage
36
+
37
+**TODO**
38
+
39
+
40
+## License
41
+
42
+This code is released under GPLv2 or later, which is also the license for
43
+`libpandoc` and `Pandoc`.

+ 117
- 0
main.c View File

@@ -0,0 +1,117 @@
1
+/* Code inspired by the one from [this
2
+ * comment](https://github.com/ShabbyX/libpandoc/issues/9#issuecomment-290947373).
3
+ */
4
+#include <assert.h>
5
+#include <stdio.h>
6
+#include <stdlib.h>
7
+#include <string.h>
8
+
9
+#include <pandoc.h>
10
+
11
+#define BUF_SIZE 1024
12
+
13
/*
 * Per-conversion state handed to pandoc() as user data and shared by the
 * reader() and writer() callbacks.
 *
 * The input side (src*) describes a caller-owned byte range that reader()
 * hands out chunk by chunk. The output side (dst*) is a growable heap buffer
 * filled by writer(); whoever created the struct must free() dst afterwards.
 * A zero-initialized output side means "no buffer allocated yet".
 */
struct data {
	const char *src;   /* first byte of the input text (not owned) */
	size_t src_len;    /* total number of input bytes */
	size_t src_cur;    /* number of input bytes already given to pandoc */

	char *dst;         /* output buffer, grown with realloc(); may be NULL */
	size_t dst_len;    /* allocated capacity of dst, in bytes */
	size_t dst_cur;    /* number of bytes of dst filled so far */
};
23
+
24
+int reader(char *buf, void *user_data)
25
+{
26
+	struct data *d = user_data;
27
+
28
+	/* See how much is left to read */
29
+	size_t to_write = d->src_len - d->src_cur;
30
+
31
+	/* If finished, tell that to pandoc */
32
+	if (to_write == 0) {
33
+		return 0;
34
+    }
35
+
36
+	/* Make sure not to overflow the buffer */
37
+	if (to_write > BUF_SIZE) {
38
+		to_write = BUF_SIZE;
39
+    }
40
+
41
+	/* Copy this chunk of data */
42
+	memcpy(buf, d->src + d->src_cur, to_write * sizeof *buf);
43
+	d->src_cur += to_write;
44
+
45
+	return to_write;
46
+}
47
+
48
+void writer(const char *buf, int len, void *user_data)
49
+{
50
+	struct data *d = user_data;
51
+
52
+	/* If current buffer is too small, increase its size */
53
+	if (d->dst_cur + len > d->dst_len)
54
+	{
55
+		d->dst_len = d->dst_len == 0 ? 16 : d->dst_len * 2;
56
+		if (d->dst_len < len) {
57
+			d->dst_len = len;
58
+        }
59
+
60
+		char *new_dst = realloc(d->dst, d->dst_len * sizeof *d->dst);
61
+		assert(new_dst != NULL);
62
+
63
+		d->dst = new_dst;
64
+	}
65
+
66
+	/* Append what pandoc has produced */
67
+	memcpy(d->dst + d->dst_cur, buf, len * sizeof *buf);
68
+	d->dst_cur += len;
69
+}
70
+
71
+
72
+int main()
73
+{
74
+    const char *settings =
75
+		"{\n"
76
+			"\"writerOptions\": {\n"
77
+				"\"writerStandalone\": false\n"
78
+			"}\n"
79
+		"}";
80
+	char *buffer;
81
+    size_t len = 0;
82
+    ssize_t nRead = 0;
83
+
84
+
85
+	pandoc_init();
86
+
87
+    while ((nRead = getline(&buffer, &len, stdin)) != -1) {
88
+        struct data d = {
89
+            .src = buffer,
90
+            .src_len = nRead
91
+            /* Note: everything else is 0 initalized */
92
+        };
93
+
94
+        char *error = pandoc(BUF_SIZE,
95
+                "markdown",
96
+                "plain",
97
+                NULL,
98
+                reader,
99
+                writer,
100
+                &d);
101
+
102
+        if (error) {
103
+            fprintf(stderr, "ERROR: %s", error);
104
+        } else {
105
+            fwrite(d.dst, sizeof *d.dst, d.dst_cur, stdout);
106
+            printf("\n");
107
+        }
108
+
109
+        free(d.dst);
110
+        free(error);
111
+    }
112
+
113
+	pandoc_exit();
114
+    free(buffer);
115
+
116
+	return 0;
117
+}