Browse Source

Initial commit

Phyks (Lucas Verney) 2 years ago
parent
commit
d7012e3834
No known key found for this signature in database

+ 407
- 0
.ci/pylintrc View File

@@ -0,0 +1,407 @@
1
+[MASTER]
2
+
3
+# Specify a configuration file.
4
+#rcfile=
5
+
6
+# Python code to execute, usually for sys.path manipulation such as
7
+# pygtk.require().
8
+#init-hook=
9
+
10
+# Add files or directories to the blacklist. They should be base names, not
11
+# paths.
12
+ignore=CVS
13
+
14
+# Add files or directories matching the regex patterns to the blacklist. The
15
+# regex matches against base names, not paths.
16
+ignore-patterns=
17
+
18
+# Pickle collected data for later comparisons.
19
+persistent=yes
20
+
21
+# List of plugins (as comma separated values of python modules names) to load,
22
+# usually to register additional checkers.
23
+load-plugins=
24
+
25
+# Use multiple processes to speed up Pylint.
26
+jobs=1
27
+
28
+# Allow loading of arbitrary C extensions. Extensions are imported into the
29
+# active Python interpreter and may run arbitrary code.
30
+unsafe-load-any-extension=no
31
+
32
+# A comma-separated list of package or module names from where C extensions may
33
+# be loaded. Extensions are loading into the active Python interpreter and may
34
+# run arbitrary code
35
+extension-pkg-whitelist=
36
+
37
+# Allow optimization of some AST trees. This will activate a peephole AST
38
+# optimizer, which will apply various small optimizations. For instance, it can
39
+# be used to obtain the result of joining multiple strings with the addition
40
+# operator. Joining a lot of strings can lead to a maximum recursion error in
41
+# Pylint and this flag can prevent that. It has one side effect, the resulting
42
+# AST will be different than the one from reality. This option is deprecated
43
+# and it will be removed in Pylint 2.0.
44
+optimize-ast=no
45
+
46
+
47
+[MESSAGES CONTROL]
48
+
49
+# Only show warnings with the listed confidence levels. Leave empty to show
50
+# all. Valid levels: HIGH, INFERENCE, INFERENCE_FAILURE, UNDEFINED
51
+confidence=
52
+
53
+# Enable the message, report, category or checker with the given id(s). You can
54
+# either give multiple identifier separated by comma (,) or put this option
55
+# multiple time (only on the command line, not in the configuration file where
56
+# it should appear only once). See also the "--disable" option for examples.
57
+#enable=
58
+
59
+# Disable the message, report, category or checker with the given id(s). You
60
+# can either give multiple identifiers separated by comma (,) or put this
61
+# option multiple times (only on the command line, not in the configuration
62
+# file where it should appear only once).You can also use "--disable=all" to
63
+# disable everything first and then reenable specific checks. For example, if
64
+# you want to run only the similarities checker, you can use "--disable=all
65
+# --enable=similarities". If you want to run only the classes checker, but have
66
+# no Warning level messages displayed, use"--disable=all --enable=classes
67
+# --disable=W"
68
+disable=import-star-module-level,old-octal-literal,oct-method,print-statement,unpacking-in-except,parameter-unpacking,backtick,old-raise-syntax,old-ne-operator,long-suffix,dict-view-method,dict-iter-method,metaclass-assignment,next-method-called,raising-string,indexing-exception,raw_input-builtin,long-builtin,file-builtin,execfile-builtin,coerce-builtin,cmp-builtin,buffer-builtin,basestring-builtin,apply-builtin,filter-builtin-not-iterating,using-cmp-argument,useless-suppression,range-builtin-not-iterating,suppressed-message,no-absolute-import,old-division,cmp-method,reload-builtin,zip-builtin-not-iterating,intern-builtin,unichr-builtin,reduce-builtin,standarderror-builtin,unicode-builtin,xrange-builtin,coerce-method,delslice-method,getslice-method,setslice-method,input-builtin,round-builtin,hex-method,nonzero-method,map-builtin-not-iterating
69
+
70
+
71
+[REPORTS]
72
+
73
+# Set the output format. Available formats are text, parseable, colorized, msvs
74
+# (visual studio) and html. You can also give a reporter class, eg
75
+# mypackage.mymodule.MyReporterClass.
76
+output-format=text
77
+
78
+# Put messages in a separate file for each module / package specified on the
79
+# command line instead of printing them on stdout. Reports (if any) will be
80
+# written in a file name "pylint_global.[txt|html]". This option is deprecated
81
+# and it will be removed in Pylint 2.0.
82
+files-output=no
83
+
84
+# Tells whether to display a full report or only the messages
85
+reports=yes
86
+
87
+# Python expression which should return a note less than 10 (10 is the highest
88
+# note). You have access to the variables errors warning, statement which
89
+# respectively contain the number of errors / warnings messages and the total
90
+# number of statements analyzed. This is used by the global evaluation report
91
+# (RP0004).
92
+evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10)
93
+
94
+# Template used to display messages. This is a python new-style format string
95
+# used to format the message information. See doc for all details
96
+#msg-template=
97
+
98
+
99
+[BASIC]
100
+
101
+# Good variable names which should always be accepted, separated by a comma
102
+good-names=i,j,k,ex,Run,_,fh
103
+
104
+# Bad variable names which should always be refused, separated by a comma
105
+bad-names=foo,bar,baz,toto,tutu,tata
106
+
107
+# Colon-delimited sets of names that determine each other's naming style when
108
+# the name regexes allow several styles.
109
+name-group=
110
+
111
+# Include a hint for the correct naming format with invalid-name
112
+include-naming-hint=no
113
+
114
+# List of decorators that produce properties, such as abc.abstractproperty. Add
115
+# to this list to register other decorators that produce valid properties.
116
+property-classes=abc.abstractproperty
117
+
118
+# Regular expression matching correct function names
119
+function-rgx=[a-z_][a-z0-9_]{2,30}$
120
+
121
+# Naming hint for function names
122
+function-name-hint=[a-z_][a-z0-9_]{2,30}$
123
+
124
+# Regular expression matching correct variable names
125
+variable-rgx=[a-z_][a-z0-9_]{2,30}$
126
+
127
+# Naming hint for variable names
128
+variable-name-hint=[a-z_][a-z0-9_]{2,30}$
129
+
130
+# Regular expression matching correct constant names
131
+const-rgx=(([A-Z_][A-Z0-9_]*)|(__.*__))$
132
+
133
+# Naming hint for constant names
134
+const-name-hint=(([A-Z_][A-Z0-9_]*)|(__.*__))$
135
+
136
+# Regular expression matching correct attribute names
137
+attr-rgx=[a-z_][a-z0-9_]{2,30}$
138
+
139
+# Naming hint for attribute names
140
+attr-name-hint=[a-z_][a-z0-9_]{2,30}$
141
+
142
+# Regular expression matching correct argument names
143
+argument-rgx=[a-z_][a-z0-9_]{2,30}$
144
+
145
+# Naming hint for argument names
146
+argument-name-hint=[a-z_][a-z0-9_]{2,30}$
147
+
148
+# Regular expression matching correct class attribute names
149
+class-attribute-rgx=([A-Za-z_][A-Za-z0-9_]{2,30}|(__.*__))$
150
+
151
+# Naming hint for class attribute names
152
+class-attribute-name-hint=([A-Za-z_][A-Za-z0-9_]{2,30}|(__.*__))$
153
+
154
+# Regular expression matching correct inline iteration names
155
+inlinevar-rgx=[A-Za-z_][A-Za-z0-9_]*$
156
+
157
+# Naming hint for inline iteration names
158
+inlinevar-name-hint=[A-Za-z_][A-Za-z0-9_]*$
159
+
160
+# Regular expression matching correct class names
161
+class-rgx=[A-Z_][a-zA-Z0-9]+$
162
+
163
+# Naming hint for class names
164
+class-name-hint=[A-Z_][a-zA-Z0-9]+$
165
+
166
+# Regular expression matching correct module names
167
+module-rgx=(([a-z_][a-z0-9_]*)|([A-Z][a-zA-Z0-9]+))$
168
+
169
+# Naming hint for module names
170
+module-name-hint=(([a-z_][a-z0-9_]*)|([A-Z][a-zA-Z0-9]+))$
171
+
172
+# Regular expression matching correct method names
173
+method-rgx=[a-z_][a-z0-9_]{2,30}$
174
+
175
+# Naming hint for method names
176
+method-name-hint=[a-z_][a-z0-9_]{2,30}$
177
+
178
+# Regular expression which should only match function or class names that do
179
+# not require a docstring.
180
+no-docstring-rgx=^_
181
+
182
+# Minimum line length for functions/classes that require docstrings, shorter
183
+# ones are exempt.
184
+docstring-min-length=-1
185
+
186
+
187
+[ELIF]
188
+
189
+# Maximum number of nested blocks for function / method body
190
+max-nested-blocks=5
191
+
192
+
193
+[FORMAT]
194
+
195
+# Maximum number of characters on a single line.
196
+max-line-length=100
197
+
198
+# Regexp for a line that is allowed to be longer than the limit.
199
+ignore-long-lines=^\s*(# )?<?https?://\S+>?$
200
+
201
+# Allow the body of an if to be on the same line as the test if there is no
202
+# else.
203
+single-line-if-stmt=no
204
+
205
+# List of optional constructs for which whitespace checking is disabled. `dict-
206
+# separator` is used to allow tabulation in dicts, etc.: {1  : 1,\n222: 2}.
207
+# `trailing-comma` allows a space between comma and closing bracket: (a, ).
208
+# `empty-line` allows space-only lines.
209
+no-space-check=trailing-comma,dict-separator
210
+
211
+# Maximum number of lines in a module
212
+max-module-lines=1000
213
+
214
+# String used as indentation unit. This is usually "    " (4 spaces) or "\t" (1
215
+# tab).
216
+indent-string='    '
217
+
218
+# Number of spaces of indent required inside a hanging  or continued line.
219
+indent-after-paren=4
220
+
221
+# Expected format of line ending, e.g. empty (any line ending), LF or CRLF.
222
+expected-line-ending-format=
223
+
224
+
225
+[LOGGING]
226
+
227
+# Logging modules to check that the string format arguments are in logging
228
+# function parameter format
229
+logging-modules=logging
230
+
231
+
232
+[MISCELLANEOUS]
233
+
234
+# List of note tags to take in consideration, separated by a comma.
235
+notes=FIXME,XXX,TODO
236
+
237
+
238
+[SIMILARITIES]
239
+
240
+# Minimum lines number of a similarity.
241
+min-similarity-lines=4
242
+
243
+# Ignore comments when computing similarities.
244
+ignore-comments=yes
245
+
246
+# Ignore docstrings when computing similarities.
247
+ignore-docstrings=yes
248
+
249
+# Ignore imports when computing similarities.
250
+ignore-imports=no
251
+
252
+
253
+[SPELLING]
254
+
255
+# Spelling dictionary name. Available dictionaries: none. To make it working
256
+# install python-enchant package.
257
+spelling-dict=
258
+
259
+# List of comma separated words that should not be checked.
260
+spelling-ignore-words=
261
+
262
+# A path to a file that contains private dictionary; one word per line.
263
+spelling-private-dict-file=
264
+
265
+# Tells whether to store unknown words to indicated private dictionary in
266
+# --spelling-private-dict-file option instead of raising a message.
267
+spelling-store-unknown-words=no
268
+
269
+
270
+[TYPECHECK]
271
+
272
+# Tells whether missing members accessed in mixin class should be ignored. A
273
+# mixin class is detected if its name ends with "mixin" (case insensitive).
274
+ignore-mixin-members=yes
275
+
276
+# List of module names for which member attributes should not be checked
277
+# (useful for modules/projects where namespaces are manipulated during runtime
278
+# and thus existing member attributes cannot be deduced by static analysis. It
279
+# supports qualified module names, as well as Unix pattern matching.
280
+ignored-modules=
281
+
282
+# List of class names for which member attributes should not be checked (useful
283
+# for classes with dynamically set attributes). This supports the use of
284
+# qualified names.
285
+ignored-classes=optparse.Values,thread._local,_thread._local
286
+
287
+# List of members which are set dynamically and missed by pylint inference
288
+# system, and so shouldn't trigger E1101 when accessed. Python regular
289
+# expressions are accepted.
290
+generated-members=
291
+
292
+# List of decorators that produce context managers, such as
293
+# contextlib.contextmanager. Add to this list to register other decorators that
294
+# produce valid context managers.
295
+contextmanager-decorators=contextlib.contextmanager
296
+
297
+
298
+[VARIABLES]
299
+
300
+# Tells whether we should check for unused import in __init__ files.
301
+init-import=no
302
+
303
+# A regular expression matching the name of dummy variables (i.e. expectedly
304
+# not used).
305
+dummy-variables-rgx=(_+[a-zA-Z0-9]*?$)|dummy
306
+
307
+# List of additional names supposed to be defined in builtins. Remember that
308
+# you should avoid to define new builtins when possible.
309
+additional-builtins=
310
+
311
+# List of strings which can identify a callback function by name. A callback
312
+# name must start or end with one of those strings.
313
+callbacks=cb_,_cb
314
+
315
+# List of qualified module names which can have objects that can redefine
316
+# builtins.
317
+redefining-builtins-modules=six.moves,future.builtins,builtins
318
+
319
+
320
+[CLASSES]
321
+
322
+# List of method names used to declare (i.e. assign) instance attributes.
323
+defining-attr-methods=__init__,__new__,setUp
324
+
325
+# List of valid names for the first argument in a class method.
326
+valid-classmethod-first-arg=cls
327
+
328
+# List of valid names for the first argument in a metaclass class method.
329
+valid-metaclass-classmethod-first-arg=mcs
330
+
331
+# List of member names, which should be excluded from the protected access
332
+# warning.
333
+exclude-protected=_asdict,_fields,_replace,_source,_make
334
+
335
+
336
+[DESIGN]
337
+
338
+# Maximum number of arguments for function / method
339
+max-args=5
340
+
341
+# Argument names that match this expression will be ignored. Default to name
342
+# with leading underscore
343
+ignored-argument-names=_.*
344
+
345
+# Maximum number of locals for function / method body
346
+max-locals=15
347
+
348
+# Maximum number of return / yield for function / method body
349
+max-returns=6
350
+
351
+# Maximum number of branch for function / method body
352
+max-branches=12
353
+
354
+# Maximum number of statements in function / method body
355
+max-statements=50
356
+
357
+# Maximum number of parents for a class (see R0901).
358
+max-parents=7
359
+
360
+# Maximum number of attributes for a class (see R0902).
361
+max-attributes=7
362
+
363
+# Minimum number of public methods for a class (see R0903).
364
+min-public-methods=2
365
+
366
+# Maximum number of public methods for a class (see R0904).
367
+max-public-methods=20
368
+
369
+# Maximum number of boolean expressions in a if statement
370
+max-bool-expr=5
371
+
372
+
373
+[IMPORTS]
374
+
375
+# Deprecated modules which should not be used, separated by a comma
376
+deprecated-modules=regsub,TERMIOS,Bastion,rexec
377
+
378
+# Create a graph of every (i.e. internal and external) dependencies in the
379
+# given file (report RP0402 must not be disabled)
380
+import-graph=
381
+
382
+# Create a graph of external dependencies in the given file (report RP0402 must
383
+# not be disabled)
384
+ext-import-graph=
385
+
386
+# Create a graph of internal dependencies in the given file (report RP0402 must
387
+# not be disabled)
388
+int-import-graph=
389
+
390
+# Force import order to recognize a module as part of the standard
391
+# compatibility libraries.
392
+known-standard-library=
393
+
394
+# Force import order to recognize a module as part of a third party library.
395
+known-third-party=enchant
396
+
397
+# Analyse import fallback blocks. This can be used to support both Python 2 and
398
+# 3 compatible code, which means that the block might have code that exists
399
+# only in one or another interpreter, leading to false positives when analysed.
400
+analyse-fallback-blocks=no
401
+
402
+
403
+[EXCEPTIONS]
404
+
405
+# Exceptions that will emit a warning when being caught. Defaults to
406
+# "Exception"
407
+overgeneral-exceptions=Exception

+ 4
- 1
.gitignore View File

@@ -1,3 +1,6 @@
1 1
 build
2 2
 *.json
3
-config.py
3
+*.pyc
4
+*.swp
5
+*.swo
6
+*.db

+ 21
- 0
LICENSE.md View File

@@ -0,0 +1,21 @@
1
+The MIT License (MIT)
2
+
3
+Copyright (c) 2017 Phyks (Lucas Verney)
4
+
5
+Permission is hereby granted, free of charge, to any person obtaining a copy
6
+of this software and associated documentation files (the "Software"), to deal
7
+in the Software without restriction, including without limitation the rights
8
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+copies of the Software, and to permit persons to whom the Software is
10
+furnished to do so, subject to the following conditions:
11
+
12
+The above copyright notice and this permission notice shall be included in all
13
+copies or substantial portions of the Software.
14
+
15
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+SOFTWARE.

+ 115
- 0
README.md View File

@@ -0,0 +1,115 @@
1
+Flatisfy
2
+========
3
+
4
+Flatisfy is your new companion to ease your search of a new housing :)
5
+
6
+
7
+It uses [Weboob](http://weboob.org/) to get all the housing posts on most of
8
+the websites offering housing posts, and then offers a bunch of pipelines to
9
+filter and deduplicate the fetched housings.
10
+
11
+
12
+It can be used as a command-line utility, but also exposes a web API and
13
+visualisation, to browse through the results.
14
+
15
+
16
+_Note_: It is targeted at French users (due to the currently supported
17
+websites), and in particular at people living close to Paris, as I developed
18
+it for my personal use, and am currently living in Paris :) Any feedback and
19
+merge requests to better support other countries / cities are more than
20
+welcome!
21
+
22
+_Note_: In this repository and across the code, I am using the name "flat". I
23
+use it as a placeholder for "housing" and consider both are interchangeable.
24
+This code is not restricted to handling flats only!
25
+
26
+
27
+## Getting started
28
+
29
+1. Clone the repository.
30
+2. Install required Python modules: `pip install -r requirements.txt`.
31
+3. Init a configuration file: `python -m flatisfy init-config > config.json`.
32
+   Edit it according to your needs (see below).
33
+4. Build the required data files:
34
+   `python -m flatisfy build-data --config config.json`.
35
+5. Use it to `fetch` (and output a filtered JSON list of flats) or `import`
36
+   (into an SQLite database, for the web visualization) a list of flats
37
+   matching your criteria.
38
+6. Use `python -m flatisfy serve --config config.json` to serve the web app.
39
+
40
+
41
+## Configuration
42
+
43
+List of configuration options:
44
+
45
+* `data_directory` is the directory in which you want data files to be stored.
46
+  `null` is the default value and means default `XDG` location (typically
47
+  `~/.local/share/flatisfy/`)
48
+* `max_entries` is the maximum number of entries to fetch **per Weboob
49
+  backend** (that is per housing website).
50
+* `passes` is the number of passes to run on the data. First pass is a basic
51
+  filtering and using only the information from the housing list page.
52
+  Second pass loads any possible information about the filtered flats and does
53
+  better filtering.
54
+* `queries` is a list of queries defined in `flatboob` that should be fetched.
55
+* `database` is an SQLAlchemy URI to a database file. Defaults to `null` which
56
+  means that it will store the database in the default location, in
57
+  `data_directory`.
58
+* `navitia_api_key` is an API token for [Navitia](https://www.navitia.io/)
59
+  which is required to compute travel times.
60
+
61
+### Constraints
62
+
63
+You can specify constraints, under the `constraints` key. The available
64
+constraints are:
65
+
66
+* `area` (in m²), `bedrooms`, `cost` (in currency unit), `rooms`: this is a
67
+  tuple of `(min, max)` values, defining an interval in which the value should
68
+  lie. A `null` value means that any value is within this bound.
69
+* `postal_codes` is a list of allowed postal codes. You should include any
70
+  postal code you want, and especially the postal codes close to the precise
71
+  location you want. You MUST provide some postal codes.
72
+* `time_to` is a dictionary of places to compute travel time to them.
73
+  Typically,
74
+  ```
75
+  "time_to": {
76
+    "foobar": {
77
+        "gps": [LAT, LNG],
78
+        "time": [min, max]
79
+    }
80
+  }
81
+  ```
82
+  means that the housings must be between the `min` and `max` bounds (possibly
83
+  `null`) from the place identified by the GPS coordinates `LAT` and `LNG`
84
+  (latitude and longitude), and we call this place `foobar` in human-readable
85
+  form. Beware that `time` constraints are in **seconds**.
86
+
87
+
88
+## OpenData
89
+
90
+I am using the following datasets, available under `flatisfy/data_files`,
91
+which covers Paris. If you want to run the script using some other location,
92
+you might have to change these files by matching datasets.
93
+
94
+* [LaPoste Hexasmal](https://datanova.legroupe.laposte.fr/explore/dataset/laposte_hexasmal/?disjunctive.code_commune_insee&disjunctive.nom_de_la_commune&disjunctive.code_postal&disjunctive.libell_d_acheminement&disjunctive.ligne_5) for the list of cities and postal codes in France.
95
+* [RATP stations](https://data.ratp.fr/explore/dataset/positions-geographiques-des-stations-du-reseau-ratp/table/?disjunctive.stop_name&disjunctive.code_postal&disjunctive.departement) for the list of subway stations with their positions in Paris and nearby areas.
96
+
97
+Both datasets are licensed under the Open Data Commons Open Database License
98
+(ODbL): https://opendatacommons.org/licenses/odbl/.
99
+
100
+
101
+## License
102
+
103
+The content of this repository is licensed under an MIT license, unless
104
+explicitly mentioned otherwise.
105
+
106
+
107
+## Thanks
108
+
109
+* [Weboob](http://weboob.org/)
110
+* The OpenData providers listed above!
111
+* Navitia for their really cool public transportation API.
112
+* A lot of Python modules, required for this script (see `requirements.txt`).
113
+* [Kresus](https://framagit.org/bnjbvr/kresus) which gave me part of the
114
+  original idea (at least proved me such software based on scraping can
115
+  achieve a high quality level :)

+ 0
- 130
flat.py View File

@@ -1,130 +0,0 @@
1
-# coding: utf-8
2
-#!/usr/bin/env python3
3
-import json
4
-import os
5
-import subprocess
6
-import sys
7
-
8
-from fuzzywuzzy import process as fuzzyprocess
9
-
10
-import config
11
-
12
-
13
def pretty_json(json_str):
    """Serialize the given object to an indented, key-sorted JSON string."""
    return json.dumps(
        json_str,
        indent=4,
        separators=(',', ': '),
        sort_keys=True,
    )
16
-
17
-
18
def preprocess_data():
    """
    Build the preprocessed data files under ``build/``.

    Converts ``data/ratp.json`` into a sorted, deduplicated list of
    lowercased station names written to ``build/ratp.json``. Does nothing
    when the output file already exists.
    """
    if not os.path.isdir("build"):
        os.mkdir("build")

    if os.path.isfile("build/ratp.json"):
        # Output already built, nothing to do.
        return

    with open("data/ratp.json", "r") as fh:
        raw_stations = json.load(fh)
    # Deduplicate case-insensitively via a set, then sort for stable output.
    station_names = {entry["fields"]["stop_name"].lower() for entry in raw_stations}
    with open("build/ratp.json", "w") as fh:
        fh.write(pretty_json(sorted(station_names)))
33
-
34
-
35
def fetch_flats_list():
    """
    Fetch the flats list for every query defined in the config.

    Runs ``flatboob`` once per configured query and concatenates the JSON
    results.

    :return: The combined list of flats from all queries.
    """
    flats_list = []
    for query in config.QUERIES:
        command = ["flatboob", "-n", "0", "-f", "json", "load", query]
        raw_output = subprocess.check_output(command)
        flats_list.extend(json.loads(raw_output))
    return flats_list
43
-
44
-
45
def remove_duplicates(flats_list):
    """
    Remove flats with duplicate ids, keeping the first occurrence.

    :param flats_list: A list of flat dicts, each having an ``"id"`` key.
    :return: A new list with duplicates (by ``"id"``) removed, original
    order preserved.
    """
    unique_flats_list = []
    seen_ids = set()  # set membership is O(1), unlike the original list
    for flat in flats_list:
        if flat["id"] in seen_ids:
            continue
        # Bug fix: the original appended the builtin function ``id`` instead
        # of ``flat["id"]``, so no duplicate was ever detected.
        seen_ids.add(flat["id"])
        unique_flats_list.append(flat)
    return unique_flats_list
54
-
55
-
56
def sort_by(flats_list, key="cost"):
    """
    Sort a list of flats by the given key.

    :param flats_list: A list of flat dicts.
    :param key: The dict key to sort on (defaults to ``"cost"``).
    :return: A new sorted list.
    """
    # Bug fix: the original ignored the ``key`` argument and always sorted
    # on "cost".
    return sorted(flats_list, key=lambda flat: flat[key])
58
-
59
-
60
def refine_params(flats_list):
    """
    Filter out flats whose cost or area lies outside the configured bounds.

    Bounds come from ``config.PARAMS`` and are compared strictly (exclusive
    on both ends). Flats missing a field are not filtered on that field.

    :param flats_list: A list of flat dicts.
    :return: A lazy ``filter`` iterator over the matching flats.
    """
    def within_bounds(flat):
        # Guard-clause style: reject as soon as one bound is violated.
        if "cost" in flat:
            if not config.PARAMS["min_cost"] < flat["cost"] < config.PARAMS["max_cost"]:
                return False
        if "area" in flat:
            if not config.PARAMS["min_area"] < flat["area"] < config.PARAMS["max_area"]:
                return False
        return True

    return filter(within_bounds, flats_list)
80
-
81
-
82
def match_ratp(flats_list):
    """
    Annotate flats with their best-matching RATP station.

    Fuzzy-matches the ``station`` field (when present and non-empty) against
    the preprocessed station list from ``build/ratp.json`` and stores the
    result under ``ratp_station``.

    :param flats_list: A list of flat dicts, mutated in place.
    :return: The same list.
    """
    with open("build/ratp.json", "r") as fh:
        ratp_stations = json.load(fh)

    for flat in flats_list:
        station = flat.get("station")
        if not station:
            continue
        # There is some station fetched by flatboob, try to match it
        flat["ratp_station"] = fuzzyprocess.extractOne(station, ratp_stations)
        # TODO: Cross-check station location to choose the best fit

    return flats_list
96
-
97
-
98
def main(dumpfile=None):
    """
    Run the first filtering pass over the flats list.

    :param dumpfile: Optional path to a JSON dump to load instead of
    fetching a fresh list through flatboob.
    :return: The deduplicated, sorted and filtered flats list.
    """
    if dumpfile is not None:
        with open(dumpfile, "r") as fh:
            flats_list = json.load(fh)
    else:
        flats_list = fetch_flats_list()

    # First pass
    flats_list = remove_duplicates(flats_list)
    flats_list = sort_by(flats_list, "cost")
    flats_list = refine_params(flats_list)

    # TODO: flats_list = match_ratp(flats_list)

    # TODO: Second pass, loading additional infos for each entry

    return flats_list
115
-
116
-
117
if __name__ == "__main__":
    # Optional positional argument: a JSON dump to load instead of fetching.
    dumpfile = sys.argv[1] if len(sys.argv) > 1 else None

    try:
        preprocess_data()
        print(pretty_json(main(dumpfile)))
    except KeyboardInterrupt:
        # Exit quietly on Ctrl-C instead of printing a traceback.
        pass

+ 5
- 0
flatisfy/__init__.py View File

@@ -0,0 +1,5 @@
1
# coding: utf-8
"""
``Flatisfy`` is a tool to help you find a new housing based on some criteria.
"""
# Package version string, used by packaging machinery and shown by the CLI.
__version__ = "0.1"

+ 176
- 0
flatisfy/__main__.py View File

@@ -0,0 +1,176 @@
1
+# coding: utf-8
2
+"""
3
+Main entry point of the Flatisfy code.
4
+"""
5
+from __future__ import absolute_import, print_function, unicode_literals
6
+
7
+import argparse
8
+import logging
9
+import sys
10
+
11
+import flatisfy.config
12
+from flatisfy import cmds
13
+from flatisfy import data
14
+from flatisfy import tools
15
+
16
+
17
+LOGGER = logging.getLogger("flatisfy")
18
+
19
+
20
def parse_args(argv=None):
    """
    Create parser and parse arguments.

    :param argv: Optional list of argument strings (defaults to
    ``sys.argv``).
    :return: The parsed arguments namespace.
    """
    parser = argparse.ArgumentParser(
        prog="Flatisfy",
        description="Find the perfect flat.",
    )

    # Parent parser containing arguments common to any subcommand
    common = argparse.ArgumentParser(add_help=False)
    common.add_argument("--data-dir",
                        help="Location of Flatisfy data directory.")
    common.add_argument("--config",
                        help="Configuration file to use.")
    common.add_argument("--passes", choices=[0, 1, 2], type=int,
                        help="Number of passes to do on the filtered data.")
    common.add_argument("--max-entries", type=int,
                        help="Maximum number of entries to fetch.")
    common.add_argument("-v", "--verbose", action="store_true",
                        help="Verbose logging output.")
    common.add_argument("-vv", action="store_true",
                        help="Debug logging output.")

    # Subcommands
    subparsers = parser.add_subparsers(dest="cmd",
                                       help="Available subcommands")

    # Build data subcommand
    subparsers.add_parser("build-data", parents=[common],
                          help="Build necessary data")

    # Init config subcommand
    init_config_parser = subparsers.add_parser(
        "init-config", parents=[common],
        help="Initialize empty configuration."
    )
    init_config_parser.add_argument(
        "output", nargs="?", help="Output config file. Use '-' for stdout."
    )

    # Fetch subcommand parser
    subparsers.add_parser("fetch", parents=[common],
                          help="Fetch housings posts")

    # Filter subcommand parser
    filter_parser = subparsers.add_parser(
        "filter", parents=[common],
        help=("Filter housings posts. No "
              "fetching of additional infos "
              "is done."))
    filter_parser.add_argument(
        "input",
        help="JSON dump of the housings post to filter."
    )

    # Import subcommand parser
    subparsers.add_parser("import", parents=[common],
                          help="Import housing posts in database.")

    # Serve subcommand parser
    serve_parser = subparsers.add_parser("serve", parents=[common],
                                         help="Serve the web app.")
    serve_parser.add_argument("--port", type=int, help="Port to bind to.")
    serve_parser.add_argument("--host", help="Host to listen on.")

    return parser.parse_args(argv)
100
+
101
+
102
def main():
    """
    Main module code.

    Parse command-line arguments, configure logging, load the configuration
    and dispatch to the requested subcommand.
    """
    # Bug fix: ``flatisfy.exceptions`` is referenced in the except clause
    # below but was never imported anywhere in this module, so handling a
    # DataBuildError raised AttributeError instead. Import it here to keep
    # the fix local to this function.
    import flatisfy.exceptions

    # Parse arguments
    args = parse_args()

    # Set logger
    if args.vv:
        logging.basicConfig(level=logging.DEBUG)
        logging.getLogger('sqlalchemy.engine').setLevel(logging.DEBUG)
    elif args.verbose:
        logging.basicConfig(level=logging.INFO)
        # sqlalchemy INFO level is way too loud, just stick with WARNING
        logging.getLogger('sqlalchemy.engine').setLevel(logging.WARNING)
    else:
        logging.basicConfig(level=logging.WARNING)
        logging.getLogger('sqlalchemy.engine').setLevel(logging.WARNING)

    # Init-config command: runs before config loading since it creates one.
    if args.cmd == "init-config":
        flatisfy.config.init_config(args.output)
        sys.exit(0)
    else:
        # Load config
        config = flatisfy.config.load_config(args)
        if config is None:
            LOGGER.error("Invalid configuration. Exiting. "
                         "Run init-config before if this is the first time "
                         "you run Flatisfy.")
            sys.exit(1)

    # Build data files (forced rebuild for the explicit build-data command)
    try:
        if args.cmd == "build-data":
            data.preprocess_data(config, force=True)
            sys.exit(0)
        else:
            data.preprocess_data(config)
    except flatisfy.exceptions.DataBuildError:
        sys.exit(1)

    # Fetch command
    if args.cmd == "fetch":
        # Fetch and filter flats list
        flats_list, _ = cmds.fetch_and_filter(config)
        # Sort by cost
        flats_list = tools.sort_list_of_dicts_by(flats_list, "cost")

        print(
            tools.pretty_json(flats_list)
        )
    # Filter command
    elif args.cmd == "filter":
        # Load and filter flats list
        flats_list = cmds.load_and_filter(args.input, config)
        # Sort by cost
        flats_list = tools.sort_list_of_dicts_by(flats_list, "cost")

        print(
            tools.pretty_json(flats_list)
        )
    # Import command
    elif args.cmd == "import":
        cmds.import_and_filter(config)
    # Serve command
    elif args.cmd == "serve":
        cmds.serve(config)
170
+
171
+
172
if __name__ == "__main__":
    try:
        main()
    except KeyboardInterrupt:
        # Exit quietly on Ctrl-C instead of printing a traceback.
        pass

+ 110
- 0
flatisfy/cmds.py View File

@@ -0,0 +1,110 @@
1
+# coding: utf-8
2
+"""
3
+Main commands available for flatisfy.
4
+"""
5
+from __future__ import absolute_import, print_function, unicode_literals
6
+
7
+import flatisfy.filters
8
+from flatisfy import database
9
+from flatisfy.models import flat as flat_model
10
+from flatisfy import fetch
11
+from flatisfy import tools
12
+from flatisfy.web import app as web_app
13
+
14
+
15
def fetch_and_filter(config):
    """
    Fetch the available flats list. Then, filter it according to criteria.

    :param config: A config dict.
    :return: A tuple of the list of all matching flats and the list of ignored
    flats.
    """
    # TODO: Reduce load on housings listing websites
    # Fetch flats list with flatboobs
    flats_list = fetch.fetch_flats_list(config)

    # With zero filtering passes nothing is ignored; start from an empty list
    # so the return statement below is always well-defined (previously this
    # raised NameError when config["passes"] == 0).
    ignored_flats = []

    # Do a first pass with the available infos to try to remove as much
    # unwanted postings as possible
    if config["passes"] > 0:
        flats_list, ignored_flats = flatisfy.filters.first_pass(flats_list,
                                                                config)

    # Do a second pass to consolidate all the infos we found and make use of
    # additional infos
    if config["passes"] > 1:
        # Load additional infos and keep the merged dicts. Rebuild the list
        # explicitly: merely rebinding the loop variable would discard the
        # merge result if ``merge_dicts`` returns a new dict instead of
        # mutating its first argument.
        flats_list = [
            tools.merge_dicts(flat, fetch.fetch_details(flat["id"]))
            for flat in flats_list
        ]

        flats_list, extra_ignored_flats = flatisfy.filters.second_pass(
            flats_list, config
        )
        ignored_flats.extend(extra_ignored_flats)

    return flats_list, ignored_flats
47
+
48
+
49
def load_and_filter(housing_file, config):
    """
    Load the dumped flats list. Then, filter it according to criteria.

    :param housing_file: The JSON file to load flats from.
    :param config: A config dict.
    :return: A tuple of the list of all matching flats and the list of ignored
    flats.
    """
    # Load flats list
    flats_list = fetch.load_flats_list(housing_file)

    # With zero filtering passes nothing is ignored; start from an empty list
    # so the return statement below is always well-defined (previously this
    # raised NameError when config["passes"] == 0).
    ignored_flats = []

    # Do a first pass with the available infos to try to remove as much
    # unwanted postings as possible
    if config["passes"] > 0:
        flats_list, ignored_flats = flatisfy.filters.first_pass(flats_list,
                                                                config)

    # Do a second pass to consolidate all the infos we found
    if config["passes"] > 1:
        flats_list, extra_ignored_flats = flatisfy.filters.second_pass(
            flats_list, config
        )
        ignored_flats.extend(extra_ignored_flats)

    return flats_list, ignored_flats
75
+
76
+
77
def import_and_filter(config):
    """
    Fetch the available flats list. Then, filter it according to criteria.
    Finally, store it in the database.

    :param config: A config dict.
    :return: ``None``.
    """
    # Gather the flats to keep and the ones filtered out.
    kept_flats, purged_flats = fetch_and_filter(config)

    # Persist everything within a single transactional session.
    get_session = database.init_db(config["database"])
    with get_session() as session:
        for raw_flat in kept_flats:
            session.merge(flat_model.Flat.from_dict(raw_flat))

        # Purged flats are stored too, flagged with the "purged" status.
        for raw_flat in purged_flats:
            purged_flat = flat_model.Flat.from_dict(raw_flat)
            purged_flat.status = flat_model.FlatStatus.purged
            session.merge(purged_flat)
99
+
100
+
101
def serve(config):
    """
    Serve the web app.

    :param config: A config dict.
    :return: ``None``, long-running process.
    """
    # TODO: Make Bottle use logging module
    web_app.get_app(config).run(host=config["host"], port=config["port"])

+ 208
- 0
flatisfy/config.py View File

@@ -0,0 +1,208 @@
1
+# coding: utf-8
2
+"""
3
+This module handles the configuration management for Flatisfy.
4
+
5
+It loads the default configuration, then overloads it with the provided config
6
+file and then overloads it with command-line options.
7
+"""
8
+from __future__ import absolute_import, print_function, unicode_literals
9
+from builtins import str
10
+
11
import copy
import json
import logging
import os
import sys
import traceback

import appdirs

from flatisfy import tools
20
+
21
+
22
# Default configuration, overloaded first by the user's JSON config file and
# then by command-line arguments (see ``load_config`` below).
DEFAULT_CONFIG = {
    # Flatboob queries to fetch
    "queries": [],
    # Constraints to match
    "constraints": {
        "postal_codes": [],  # List of postal codes
        "area": (None, None),  # (min, max) in m^2
        "cost": (None, None),  # (min, max) in currency unit
        "rooms": (None, None),  # (min, max)
        "bedrooms": (None, None),  # (min, max)
        "time_to": {}  # Dict mapping names to {"gps": [lat, lng],
                       #                        "time": (min, max) }
                       # Time is in seconds
    },
    # Navitia API key
    "navitia_api_key": None,
    # Number of filtering passes to run
    "passes": 2,
    # Maximum number of entries to fetch
    "max_entries": None,
    # Directory in which data will be put. ``None`` is XDG default location.
    "data_directory": None,
    # SQLAlchemy URI to the database to use
    "database": None,
    # Web app port
    "port": 8080,
    # Web app host to listen on
    "host": "127.0.0.1"
}

# Module-level logger, named after the module as per logging best practice.
LOGGER = logging.getLogger(__name__)
54
+
55
+
56
def validate_config(config):
    """
    Check that the config passed as argument is a valid configuration.

    :param config: A config dictionary to fetch.
    :return: ``True`` if the configuration is valid, or the source line of the
    failed check (a string) otherwise — hence callers must compare the result
    with ``is True``, not rely on truthiness.
    """
    def _check_constraints_bounds(bounds):
        """
        Check that ``bounds`` is a valid (min, max) pair of optional
        non-negative numbers.
        """
        assert len(bounds) == 2
        # Each bound is either None (unbounded) or a non-negative number.
        assert all(
            x is None or (isinstance(x, (int, float)) and x >= 0)
            for x in bounds
        )
        if bounds[0] is not None and bounds[1] is not None:
            assert bounds[1] > bounds[0]

    try:
        # Note: The traceback fetching code only handles single line asserts.
        # Then, we disable line-too-long pylint check and E501 flake8 checks
        # and use long lines whenever needed, in order to have the full assert
        # message in the log output.
        # pylint: disable=line-too-long
        assert "postal_codes" in config["constraints"]
        assert len(config["constraints"]["postal_codes"]) > 0

        assert "area" in config["constraints"]
        _check_constraints_bounds(config["constraints"]["area"])

        assert "cost" in config["constraints"]
        _check_constraints_bounds(config["constraints"]["cost"])

        assert "rooms" in config["constraints"]
        _check_constraints_bounds(config["constraints"]["rooms"])

        assert "bedrooms" in config["constraints"]
        _check_constraints_bounds(config["constraints"]["bedrooms"])

        assert "time_to" in config["constraints"]
        assert isinstance(config["constraints"]["time_to"], dict)
        for name, item in config["constraints"]["time_to"].items():
            assert isinstance(name, str)
            assert "gps" in item
            assert isinstance(item["gps"], list)
            assert len(item["gps"]) == 2
            assert "time" in item
            _check_constraints_bounds(item["time"])

        assert config["passes"] in [0, 1, 2]
        assert config["max_entries"] is None or (isinstance(config["max_entries"], int) and config["max_entries"] > 0)  # noqa: E501

        assert config["data_directory"] is None or isinstance(config["data_directory"], str)  # noqa: E501

        assert config["database"] is None or isinstance(config["database"], str)  # noqa: E501

        assert isinstance(config["port"], int)
        assert isinstance(config["host"], str)

        return True
    except (AssertionError, KeyError):
        # Return the source line of the failing assert so the caller can log
        # a meaningful error message.
        _, _, exc_traceback = sys.exc_info()
        return traceback.extract_tb(exc_traceback)[-1][-1]
124
+
125
+
126
def load_config(args=None):
    """
    Load the configuration from file.

    The defaults are overloaded first by the JSON config file (if any), then
    by the command-line arguments.

    :param args: An argparse args structure.
    :return: The loaded config dict, or ``None`` if it failed validation.
    """
    LOGGER.info("Initializing configuration...")
    # Deep-copy the defaults: DEFAULT_CONFIG holds nested mutable structures
    # ("constraints", "time_to", ...), so a shallow ``copy()`` would let any
    # mutation of the returned config leak back into the module-level
    # defaults.
    config_data = copy.deepcopy(DEFAULT_CONFIG)

    # Load config from specified JSON
    if args and getattr(args, "config", None):
        LOGGER.debug("Loading configuration from %s.", args.config)
        try:
            with open(args.config, "r") as fh:
                config_data.update(json.load(fh))
        except (IOError, ValueError):
            LOGGER.error(
                "Unable to load configuration from file, "
                "using default configuration."
            )

    # Overload config with arguments
    if args and getattr(args, "passes", None) is not None:
        LOGGER.debug(
            "Overloading number of passes from CLI arguments: %d.",
            args.passes
        )
        config_data["passes"] = args.passes
    if args and getattr(args, "max_entries", None) is not None:
        LOGGER.debug(
            "Overloading maximum number of entries from CLI arguments: %d.",
            args.max_entries
        )
        config_data["max_entries"] = args.max_entries
    if args and getattr(args, "port", None) is not None:
        LOGGER.debug("Overloading web app port: %d.", args.port)
        config_data["port"] = args.port
    if args and getattr(args, "host", None) is not None:
        LOGGER.debug("Overloading web app host: %s.", args.host)
        config_data["host"] = str(args.host)

    # Handle data_directory option
    if args and getattr(args, "data_dir", None) is not None:
        LOGGER.debug("Overloading data directory from CLI arguments.")
        config_data["data_directory"] = args.data_dir
    elif config_data["data_directory"] is None:
        # Fall back to the platform-specific per-user data directory.
        config_data["data_directory"] = appdirs.user_data_dir(
            "flatisfy",
            "flatisfy"
        )
        LOGGER.debug("Using default XDG data directory: %s.",
                     config_data["data_directory"])

    # Default the database to an SQLite file inside the data directory.
    if config_data["database"] is None:
        config_data["database"] = "sqlite:///" + os.path.join(
            config_data["data_directory"],
            "flatisfy.db"
        )

    config_validation = validate_config(config_data)
    if config_validation is True:
        LOGGER.info("Config has been fully initialized.")
        return config_data
    # ``validate_config`` returns the offending assertion line on failure.
    LOGGER.error("Error in configuration: %s.", config_validation)
    return None
194
+
195
+
196
def init_config(output=None):
    """
    Initialize an empty configuration file.

    :param output: File to write content to. Defaults to ``stdout`` (the
    previous docstring incorrectly said ``stdin``).
    :return: ``None``.
    """
    config_data = DEFAULT_CONFIG.copy()

    if output and output != "-":
        with open(output, "w") as fh:
            fh.write(tools.pretty_json(config_data))
    else:
        # "-" conventionally means standard output.
        print(tools.pretty_json(config_data))

+ 163
- 0
flatisfy/data.py View File

@@ -0,0 +1,163 @@
1
+# coding : utf-8
2
+"""
3
+This module contains all the code related to building necessary data files from
4
+the source opendata files.
5
+"""
6
+from __future__ import absolute_import, print_function, unicode_literals
7
+
8
+import collections
9
+import json
10
+import logging
11
+import os
12
+
13
+import flatisfy.exceptions
14
+
15
+
16
+LOGGER = logging.getLogger(__name__)
17
+MODULE_DIR = os.path.dirname(os.path.realpath(__file__))
18
+
19
+
20
def _preprocess_ratp(output_dir):
    """
    Build the RATP stations file from the raw RATP opendata dump.

    :param output_dir: Directory in which the output file should reside.
    :return: ``True`` on successful build, ``False`` otherwise.
    """
    raw_file = os.path.join(MODULE_DIR, "data_files/ratp.json")
    try:
        with open(raw_file, "r") as fh:
            raw_stops = json.load(fh)
    except (IOError, ValueError):
        LOGGER.error("Invalid raw RATP opendata file.")
        return False

    # Group the GPS coordinates of every stop under its lowercased name.
    stations = collections.defaultdict(list)
    for stop in raw_stops:
        stations[stop["fields"]["stop_name"].lower()].append(
            stop["fields"]["coord"]
        )

    with open(os.path.join(output_dir, "ratp.json"), "w") as fh:
        json.dump(stations, fh)

    return True
47
+
48
+
49
def _preprocess_laposte(output_dir):
    """
    Build JSON files from the postal codes data.

    Two indexes are produced from the raw LaPoste opendata file:
    ``postal_codes.json`` (postal code -> GPS coordinates and city name) and
    ``cities.json`` (city name -> GPS coordinates and postal code).

    :param output_dir: Directory in which the output files should reside.
    :return: ``True`` on successful build, ``False`` otherwise.

    .. note :: When a city has several postal codes (or a postal code maps to
    several cities), the last item seen in the raw file wins.
    """
    # Load opendata file
    try:
        with open(
            os.path.join(MODULE_DIR, "data_files/laposte.json"), "r"
        ) as fh:
            raw_laposte_data = json.load(fh)
    except (IOError, ValueError):
        LOGGER.error("Invalid raw LaPoste opendata file.")
        return False

    # Build both indexes in a single pass over the raw data instead of two.
    postal_codes_data = {}
    cities_data = {}
    for item in raw_laposte_data:
        fields = item["fields"]
        try:
            postal_codes_data[fields["code_postal"]] = {
                "gps": fields["coordonnees_gps"],
                "nom": fields["nom_de_la_commune"].title()
            }
            cities_data[fields["nom_de_la_commune"].title()] = {
                "gps": fields["coordonnees_gps"],
                "postal_code": fields["code_postal"]
            }
        except KeyError:
            # Use ``get`` here: the missing key may be the very one we are
            # logging, and plain indexing would raise again inside this
            # handler (which previously crashed the whole build).
            LOGGER.info("Missing data for postal code %s, skipping it.",
                        fields.get("code_postal", "?"))

    with open(os.path.join(output_dir, "postal_codes.json"), "w") as fh:
        json.dump(postal_codes_data, fh)
    with open(os.path.join(output_dir, "cities.json"), "w") as fh:
        json.dump(cities_data, fh)

    return True
96
+
97
+
98
def preprocess_data(config, force=False):
    """
    Ensures that all the necessary data files have been built from the raw
    opendata files.

    :param config: A config dictionary.
    :param force: Whether to force rebuild or not.
    :raise flatisfy.exceptions.DataBuildError: When a data file could not be
    built.
    """
    LOGGER.debug("Data directory is %s.", config["data_directory"])
    opendata_directory = os.path.join(config["data_directory"], "opendata")
    try:
        LOGGER.info("Ensuring the data directory exists.")
        os.makedirs(opendata_directory)
        LOGGER.debug("Created opendata directory at %s.", opendata_directory)
    except OSError:
        # ``makedirs`` raises OSError both when the directory already exists
        # and on real failures (e.g. permission denied). Only swallow the
        # former; re-raise anything else instead of silently carrying on
        # without a usable directory.
        if not os.path.isdir(opendata_directory):
            raise
        LOGGER.debug("Opendata directory already existed, doing nothing.")

    is_built_ratp = os.path.isfile(
        os.path.join(opendata_directory, "ratp.json")
    )
    if not is_built_ratp or force:
        LOGGER.info("Building from RATP data.")
        if not _preprocess_ratp(opendata_directory):
            raise flatisfy.exceptions.DataBuildError("Error with RATP data.")

    is_built_laposte = (
        os.path.isfile(os.path.join(opendata_directory, "cities.json")) and
        os.path.isfile(os.path.join(opendata_directory, "postal_codes.json"))
    )
    if not is_built_laposte or force:
        LOGGER.info("Building from LaPoste data.")
        if not _preprocess_laposte(opendata_directory):
            raise flatisfy.exceptions.DataBuildError(
                "Error with LaPoste data."
            )
133
+
134
+
135
def load_data(data_type, config):
    """
    Load a given built data file.

    :param data_type: A valid data identifier ("postal_codes", "cities" or
    "ratp").
    :param config: A config dictionary.
    :return: The loaded data. ``None`` if the query is incorrect.
    """
    if data_type not in ["postal_codes", "cities", "ratp"]:
        LOGGER.error("Invalid request. No %s data file.", data_type)
        return None

    datafile_path = os.path.join(
        config["data_directory"], "opendata", "%s.json" % data_type
    )
    try:
        with open(datafile_path, "r") as fh:
            loaded = json.load(fh)
    except IOError:
        LOGGER.error("No such data file: %s.", datafile_path)
        return None
    except ValueError:
        LOGGER.error("Invalid JSON data file: %s.", datafile_path)
        return None

    # An empty (but valid) file is suspicious enough to warn about.
    if len(loaded) == 0:
        LOGGER.warning("Loading empty data for %s.", data_type)

    return loaded

+ 1
- 0
flatisfy/data_files/laposte.json
File diff suppressed because it is too large
View File


data/ratp.json → flatisfy/data_files/ratp.json View File


+ 64
- 0
flatisfy/database/__init__.py View File

@@ -0,0 +1,64 @@
1
+# coding: utf-8
2
+"""
3
+This module contains functions related to the database.
4
+"""
5
+from __future__ import absolute_import, print_function, unicode_literals
6
+
7
+import sqlite3
8
+
9
+from contextlib import contextmanager
10
+
11
+from sqlalchemy import event, create_engine
12
+from sqlalchemy.engine import Engine
13
+from sqlalchemy.orm import sessionmaker
14
+
15
+import flatisfy.models.flat  # noqa: F401
16
+from flatisfy.database.base import BASE
17
+
18
+
19
@event.listens_for(Engine, "connect")
def set_sqlite_pragma(dbapi_connection, _):
    """
    Auto enable foreign keys for SQLite.

    Registered as a SQLAlchemy "connect" event listener, so it runs for every
    new DBAPI connection made by any engine.

    :param dbapi_connection: The raw DBAPI connection just created.
    :param _: The connection record (unused).
    """
    # Play well with other DB backends: only SQLite connections need (and
    # understand) this pragma, so skip connections from any other driver.
    if isinstance(dbapi_connection, sqlite3.Connection):
        cursor = dbapi_connection.cursor()
        cursor.execute("PRAGMA foreign_keys=ON")
        cursor.close()
30
+
31
def init_db(database_uri=None):
    """
    Initialize the database, ensuring tables exist etc.

    :param database_uri: An URI describing an engine to use. Defaults to
    in-memory SQLite database.
    :return: A ``get_session`` function providing a transactional session
    context manager. (The previous docstring incorrectly advertised a tuple
    of a session maker and the engine.)
    """
    if database_uri is None:
        database_uri = "sqlite:///:memory:"

    engine = create_engine(database_uri)
    # Create any missing table; existing tables are left untouched.
    BASE.metadata.create_all(engine, checkfirst=True)
    Session = sessionmaker(bind=engine)  # pylint: disable=invalid-name

    @contextmanager
    def get_session():
        """
        Provide a transactional scope around a series of operations.

        From [1].
        [1]: http://docs.sqlalchemy.org/en/latest/orm/session_basics.html#when-do-i-construct-a-session-when-do-i-commit-it-and-when-do-i-close-it.
        """
        session = Session()
        try:
            yield session
            session.commit()
        except BaseException:
            # Roll back on *any* error (including KeyboardInterrupt) and let
            # it propagate; the session must never be left half-committed.
            # Equivalent to the bare ``except:`` it replaces, but explicit.
            session.rollback()
            raise
        finally:
            session.close()

    return get_session

+ 10
- 0
flatisfy/database/base.py View File

@@ -0,0 +1,10 @@
1
+# coding: utf-8
2
+"""
3
+This module contains the definition of the declarative SQLAlchemy base.
4
+"""
5
+from __future__ import absolute_import, print_function, unicode_literals
6
+
7
+from sqlalchemy.ext.declarative import declarative_base
8
+
9
+
10
# Single declarative base shared by every model so that all tables register
# against the same metadata (used by ``init_db`` to create them).
BASE = declarative_base()

+ 48
- 0
flatisfy/database/types.py View File

@@ -0,0 +1,48 @@
1
+# coding: utf-8
2
+"""
3
+This modules implements custom types in SQLAlchemy.
4
+"""
5
+from __future__ import absolute_import, print_function, unicode_literals
6
+
7
+import json
8
+
9
+import sqlalchemy.types as types
10
+
11
+
12
class StringyJSON(types.TypeDecorator):
    """
    Stores and retrieves JSON as TEXT for SQLite.

    From
    https://avacariu.me/articles/2016/compiling-json-as-text-for-sqlite-with-sqlalchemy.

    .. note :: The associated field is immutable. That is, changes to the data
    (typically, changing the value of a dict field) will not trigger an update
    on the SQL side upon ``commit`` as the reference to the object will not
    have been updated. One should force the update by forcing an update of the
    reference (by performing a ``copy`` operation on the dict for instance).
    """

    impl = types.TEXT

    def process_bind_param(self, value, dialect):
        """
        Serialize the Python value to a JSON string before sending it to the
        database. ``None`` is passed through untouched (stored as SQL NULL).
        """
        if value is not None:
            value = json.dumps(value)
        return value

    def process_result_value(self, value, dialect):
        """
        Deserialize the JSON string coming back from the database into the
        corresponding Python value. ``None`` (SQL NULL) is passed through.
        """
        if value is not None:
            value = json.loads(value)
        return value
43
+
44
+
45
# TypeEngine.with_variant says "use StringyJSON instead when
# connecting to 'sqlite'"; every other dialect keeps the native JSON type.
# pylint: disable=invalid-name
MagicJSON = types.JSON().with_variant(StringyJSON, 'sqlite')

+ 13
- 0
flatisfy/exceptions.py View File

@@ -0,0 +1,13 @@
1
+# coding : utf-8
2
+"""
3
+This module contains all the exceptions definitions for the Flatisfy-specific
4
+exceptions.
5
+"""
6
+from __future__ import absolute_import, print_function, unicode_literals
7
+
8
+
9
class DataBuildError(Exception):
    """
    Error occurring on building a data file.
    """

+ 76
- 0
flatisfy/fetch.py View File

@@ -0,0 +1,76 @@
1
+# coding: utf-8
2
+"""
3
+This module contains all the code related to fetching and loading flats lists.
4
+"""
5
+from __future__ import absolute_import, print_function, unicode_literals
6
+
7
+import json
8
+import logging
9
+import subprocess
10
+
11
+
12
+LOGGER = logging.getLogger(__name__)
13
+
14
+
15
def fetch_flats_list(config):
    """
    Fetch the available flats using the Flatboob / Weboob config.

    :param config: A config dict.
    :return: A list of all available flats.
    """
    # ``max_entries`` does not depend on the query; compute it once instead
    # of on every loop iteration. Flatboob uses 0 to mean "no limit".
    max_entries = config["max_entries"]
    if max_entries is None:
        max_entries = 0

    flats_list = []
    for query in config["queries"]:
        LOGGER.info("Loading flats from query %s.", query)
        # NOTE(review): paths are relative to the current working directory,
        # which is fragile — consider making them configurable.
        flatboob_output = subprocess.check_output(
            ["../weboob/tools/local_run.sh", "../weboob/scripts/flatboob",
             "-n", str(max_entries), "-f", "json", "load", query]
        )
        query_flats_list = json.loads(flatboob_output)
        LOGGER.info("Fetched %d flats.", len(query_flats_list))
        flats_list.extend(query_flats_list)
    LOGGER.info("Fetched a total of %d flats.", len(flats_list))
    return flats_list
38
+
39
+
40
def fetch_details(flat_id):
    """
    Fetch the additional details for a flat using Flatboob / Weboob.

    :param flat_id: ID of the flat to fetch details for.
    :return: A flat dict with all the available data.
    """
    LOGGER.info("Loading additional details for flat %s.", flat_id)
    command = [
        "../weboob/tools/local_run.sh", "../weboob/scripts/flatboob",
        "-f", "json", "info", flat_id
    ]
    flat_details = json.loads(subprocess.check_output(command))
    LOGGER.info("Fetched details for flat %s.", flat_id)

    # Flatboob outputs a JSON list; the first entry carries the details.
    # An empty list is returned unchanged — presumably meaning "no details
    # found"; callers should be prepared for a falsy result.
    if flat_details:
        flat_details = flat_details[0]

    return flat_details
59
+
60
+
61
def load_flats_list(json_file):
    """
    Load a dumped flats list from JSON file.

    :param json_file: The file to load housings list from.
    :return: A list of all the flats in the dump file (empty on error).
    """
    try:
        LOGGER.info("Loading flats list from file %s", json_file)
        with open(json_file, "r") as fh:
            flats_list = json.load(fh)
    except (IOError, ValueError):
        # Missing file or malformed JSON: log it and fall back to no flats.
        LOGGER.error("File %s is not a valid dump file.", json_file)
        return []

    LOGGER.info("Found %d flats.", len(flats_list))
    return flats_list

+ 153
- 0
flatisfy/filters/__init__.py View File

@@ -0,0 +1,153 @@
1
+# coding: utf-8
2
+"""
3
+This module contains all the filtering functions. It exposes ``first_pass`` and
4
+``second_pass`` functions which are a set of filters applied during the first
5
+pass and the second pass.
6
+"""
7
+from __future__ import absolute_import, print_function, unicode_literals
8
+
9
+import logging
10
+
11
+from flatisfy import tools
12
+from flatisfy.filters import duplicates
13
+from flatisfy.filters import metadata
14
+
15
+
16
+LOGGER = logging.getLogger(__name__)
17
+
18
+
19
def refine_with_housing_criteria(flats_list, config):
    """
    Filter a list of flats according to criteria.

    Housings posts websites tend to return broader results than what was
    actually asked for. Then, we should filter out the list to match the
    user criteria, and avoid exposing unwanted flats.

    :param flats_list: A list of flats dict to filter.
    :param config: A config dict.
    :return: A tuple of flats to keep and flats to delete.
    """
    # For each flat, the associated `is_ok` value indicates whether it should
    # be kept or discarded.
    is_ok = [True] * len(flats_list)

    for i, flat in enumerate(flats_list):
        # Check postal code
        postal_code = flat["flatisfy"].get("postal_code", None)
        if (
                postal_code and
                postal_code not in config["constraints"]["postal_codes"]
        ):
            LOGGER.info("Postal code for flat %s is out of range.", flat["id"])
            # Simplified from ``is_ok[i] = is_ok[i] and False``.
            is_ok[i] = False

        # Check time_to: every computed travel time must fit in its interval.
        for place_name, time in flat["flatisfy"].get("time_to", {}).items():
            is_within_interval = tools.is_within_interval(
                time,
                *(config["constraints"]["time_to"][place_name]["time"])
            )
            if not is_within_interval:
                LOGGER.info("Flat %s is too far from place %s.",
                            flat["id"], place_name)
            is_ok[i] = is_ok[i] and is_within_interval

        # Check other numeric fields against their (min, max) constraints.
        for field in ["area", "cost", "rooms", "bedrooms"]:
            interval = config["constraints"][field]
            is_within_interval = tools.is_within_interval(
                flat.get(field, None),
                *interval
            )
            if not is_within_interval:
                LOGGER.info("%s for flat %s is out of range.",
                            field.capitalize(), flat["id"])
            is_ok[i] = is_ok[i] and is_within_interval

    # Partition the flats according to the computed verdicts.
    return (
        [flat for i, flat in enumerate(flats_list) if is_ok[i]],
        [flat for i, flat in enumerate(flats_list) if not is_ok[i]]
    )
80
+
81
+
82
def first_pass(flats_list, config):
    """
    First filtering pass.

    Flatboob only fetches data from the listing of the available housing. Then,
    we should do a first pass to filter based on the already available data and
    only request more data for the remaining housings.

    :param flats_list: A list of flats dict to filter.
    :param config: A config dict.
    :return: A tuple of processed flats and purged flats.
    """
    LOGGER.info("Running first filtering pass.")
    # Handle duplicates based on ids
    # Just remove them (no merge) as they should be the exact same object.
    flats_list = duplicates.detect(
        flats_list, key="id", merge=False
    )
    # Also merge duplicates based on url (these may come from different
    # flatboob backends)
    # This is especially useful as some websites such as entreparticuliers
    # contains a lot of leboncoin housings posts.
    flats_list = duplicates.detect(
        flats_list, key="url", merge=True
    )

    # Deduplication runs first so the metadata steps below are not applied to
    # the same housing several times.
    # Add the flatisfy metadata entry
    flats_list = metadata.init(flats_list)
    # Guess the postal codes
    flats_list = metadata.guess_postal_code(flats_list, config)
    # Try to match with stations
    flats_list = metadata.guess_stations(flats_list, config)
    # Remove returned housing posts that do not match criteria
    flats_list, purged_list = refine_with_housing_criteria(flats_list, config)

    return (flats_list, purged_list)
118
+
119
+
120
def second_pass(flats_list, config):
    """
    Second filtering pass.

    This pass is expected to have as most information as possible on the
    available housings. Plus it runs after first pass which already
    consolidated data.

    It should consolidate everything and try to extract as many data as
    possible from the fetched housings.

    :param flats_list: A list of flats dict to filter.
    :param config: A config dict.
    :return: A tuple of processed flats and purged flats.
    """
    LOGGER.info("Running second filtering pass.")
    # Assumed to run after first pass, so there should be no obvious duplicates
    # left and we already tried to find postal code and nearby stations.

    # Confirm postal code (re-run with the extra details now available)
    flats_list = metadata.guess_postal_code(flats_list, config)

    # TODO: Guess the address

    # Better match with stations (confirm and check better)
    flats_list = metadata.guess_stations(flats_list, config)

    # Compute travel time to specified points
    flats_list = metadata.compute_travel_times(flats_list, config)

    # Remove returned housing posts that do not match criteria
    flats_list, purged_list = refine_with_housing_criteria(flats_list, config)

    return (flats_list, purged_list)

+ 56
- 0
flatisfy/filters/duplicates.py View File

@@ -0,0 +1,56 @@
1
+# coding: utf-8
2
+"""
3
+Filtering functions to detect and merge duplicates.
4
+"""
5
+from __future__ import absolute_import, print_function, unicode_literals
6
+
7
+import collections
8
+
9
+from flatisfy import tools
10
+
11
+
12
def detect(flats_list, key="id", merge=True):
    """
    Detect obvious duplicates within a given list of flats.

    There may be duplicates found, as some queries could overlap (especially
    since when asking for a given place, websites tend to return housings in
    nearby locations as well). We need to handle them, by either deleting the
    duplicates (``merge=False``) or merging them together in a single flat
    object.

    :param flats_list: A list of flats dicts.
    :param key: The flat dicts key on which the duplicate detection should be
    done.
    :param merge: Whether the found duplicates should be merged or we should
    only keep one of them.

    :return: A deduplicated list of flat dicts.
    """
    # TODO: Keep track of found duplicates?
    # Group the flats into buckets sharing the same value for ``key``.
    # Every bucket is a candidate set of duplicates.
    buckets = collections.defaultdict(list)
    for flat in flats_list:
        buckets[flat.get(key, None)].append(flat)

    # Flatten the buckets back into a deduplicated list.
    deduplicated = []
    for bucket_key, bucket in buckets.items():
        if bucket_key is None:
            # A ``None`` key means Weboob could not load the data. Consider
            # each of these items as independent to avoid over-deduplication.
            deduplicated.extend(bucket)
        elif merge:
            # Merge requested: collapse the whole bucket into a single flat.
            deduplicated.append(tools.merge_dicts(*bucket))
        else:
            # No merge: arbitrarily keep the first flat of the bucket.
            deduplicated.append(bucket[0])
    return deduplicated

+ 349
- 0
flatisfy/filters/metadata.py View File

@@ -0,0 +1,349 @@
1
+# coding: utf-8
2
+"""
3
+Filtering functions to handle flatisfy-specific metadata.
4
+
5
+This includes functions to guess metadata (postal codes, stations) from the
6
+actual fetched data.
7
+"""
8
+from __future__ import absolute_import, print_function, unicode_literals
9
+
10
+import logging
11
+import re
12
+
13
+from flatisfy import data
14
+from flatisfy import tools
15
+
16
+
17
+LOGGER = logging.getLogger(__name__)
18
+
19
+
20
def init(flats_list):
    """
    Create a flatisfy key containing a dict of metadata fetched by flatisfy for
    each flat in the list.

    :param flats_list: A list of flats dict.
    :return: The updated list (flats are modified in place).
    """
    for flat in flats_list:
        # Initialize the metadata dict only when it is missing.
        flat.setdefault("flatisfy", {})
    return flats_list
32
+
33
+
34
def fuzzy_match(query, choices, limit=3, threshold=75):
    """
    Custom search for the best element in choices matching the query.

    :param query: The string to match.
    :param choices: An iterable of strings to match with (a ``dict_keys``
    view is accepted as well).
    :param limit: The maximum number of items to return.
    :param threshold: The score threshold to use.

    :return: Tuples of matching items and associated confidence.

    .. note :: This function works by removing any fancy character from the
    ``query`` and ``choices`` strings (replacing any non alphabetic and non
    numeric characters by space), converting to lower case and normalizing them
    (collapsing multiple spaces etc). It also converts any roman numerals to
    decimal system. It then compares the string and look for the longest string
    in ``choices`` which is a substring of ``query``. The longest one gets a
    confidence of 100. The shorter ones get a confidence proportional to their
    length.

    .. seealso :: flatisfy.tools.normalize_string

    .. todo :: Is there a better confidence measure?

    :Example:

        >>> fuzzy_match("Paris 14ème", ["Ris", "ris", "Paris 14"], limit=1)
        [("Paris 14", 100)]

        >>> fuzzy_match( \
                "Saint-Jacques, Denfert-Rochereau (Colonel Rol-Tanguy), " \
                "Mouton-Duvernet", \
                ["saint-jacques", "denfert rochereau", "duvernet", "toto"], \
                limit=4 \
            )
        [('denfert rochereau', 100), ('saint-jacques', 76)]
    """
    # Materialize the choices: callers pass ``dict.keys()`` views, which are
    # not indexable on Python 3 and would break the back-conversion below.
    choices = list(choices)

    normalized_query = tools.normalize_string(query)
    normalized_choices = [tools.normalize_string(choice) for choice in choices]

    # Remove duplicates in the choices list. Also drop empty normalized
    # strings: they trivially match any query and would make
    # ``max_confidence`` zero (division by zero) when they are the only
    # matches.
    unique_normalized_choices = [
        choice for choice in tools.uniqify(normalized_choices) if choice
    ]

    # Get the matches (normalized strings): the choices appearing as a
    # substring of the query, longest first. Keep only ``limit`` matches.
    matches = sorted(
        [
            (choice, len(choice))
            for choice in unique_normalized_choices
            if choice in normalized_query
        ],
        key=lambda x: x[1],
        reverse=True
    )[:limit]

    # Update confidence: rescale so that the longest match scores 100.
    # Multiply before dividing so the result stays exact with integer
    # division semantics (the module does not import ``division``).
    if matches:
        max_confidence = max(match[1] for match in matches)
        matches = [
            (choice, int(length * 100 / max_confidence))
            for choice, length in matches
        ]

    # Convert back matches to original strings.
    # Also filter out matches below threshold.
    matches = [
        (choices[normalized_choices.index(choice)], confidence)
        for choice, confidence in matches
        if confidence >= threshold
    ]

    return matches
106
+
107
+
108
def guess_postal_code(flats_list, config, distance_threshold=20000):
    """
    Try to guess the postal code from the location of the flats.

    :param flats_list: A list of flats dict.
    :param config: A config dict.
    :param distance_threshold: Maximum distance in meters between the
    constraint postal codes (from config) and the one found by this function,
    to avoid bad fuzzy matching. Can be ``None`` to disable thresholding.

    :return: An updated list of flats dict with guessed postal code.
    """
    opendata = {
        "cities": data.load_data("cities", config),
        "postal_codes": data.load_data("postal_codes", config)
    }

    for flat in flats_list:
        location = flat.get("location", None)
        if not location:
            # Skip everything if empty location
            LOGGER.info(
                (
                    "No location field for flat %s, skipping postal "
                    "code lookup."
                ),
                flat["id"]
            )
            continue

        # Try to find a 5-digit postal code directly in the location string.
        # NOTE: explicit conditionals instead of ``assert`` for control flow,
        # as asserts are stripped under ``python -O``.
        postal_code = None
        postal_code_match = re.search(r"[0-9]{5}", location)
        if postal_code_match is not None:
            candidate = postal_code_match.group(0)
            # Only trust the candidate if it is a known postal code.
            if candidate in opendata["postal_codes"]:
                postal_code = candidate
                LOGGER.info(
                    "Found postal code in location field for flat %s: %s.",
                    flat["id"], postal_code
                )

        # If not found, try to find a city
        if not postal_code:
            matched_city = fuzzy_match(
                location,
                opendata["cities"].keys(),
                limit=1
            )
            if matched_city:
                # Store the matching postal code
                matched_city_name = matched_city[0][0]
                postal_code = (
                    opendata["cities"][matched_city_name]["postal_code"]
                )
                LOGGER.info(
                    ("Found postal code in location field through city lookup "
                     "for flat %s: %s."),
                    flat["id"], postal_code
                )

        # Check that postal code is not too far from the ones listed in
        # config, to limit bad fuzzy matching. Guard against an empty
        # constraint list, which would make ``min()`` raise a ValueError.
        if (postal_code and distance_threshold and
                config["constraints"]["postal_codes"]):
            distance = min(
                tools.distance(
                    opendata["postal_codes"][postal_code]["gps"],
                    opendata["postal_codes"][constraint]["gps"],
                )
                for constraint in config["constraints"]["postal_codes"]
            )

            if distance > distance_threshold:
                LOGGER.info(
                    ("Postal code %s found for flat %s is off-constraints. "
                     "Min distance is %f."),
                    postal_code, flat["id"], distance
                )
                postal_code = None

        # Store it
        if postal_code:
            existing_postal_code = flat["flatisfy"].get("postal_code", None)
            if existing_postal_code and existing_postal_code != postal_code:
                LOGGER.warning(
                    "Replacing previous postal code %s by %s for flat %s.",
                    existing_postal_code, postal_code, flat["id"]
                )
            flat["flatisfy"]["postal_code"] = postal_code
        else:
            LOGGER.info("No postal code found for flat %s.", flat["id"])

    return flats_list
207
+
208
+
209
def guess_stations(flats_list, config, distance_threshold=1500):
    """
    Try to match the station field with a list of available stations nearby.

    :param flats_list: A list of flats dict. Expected to have gone through
    ``init`` already (reads ``flat["flatisfy"]``).
    :param config: A config dict.
    :param distance_threshold: Maximum distance (in meters) between the center
    of the postal code and the station to consider it ok.

    :return: An updated list of flats dict with guessed nearby stations.
    """
    opendata = {
        "postal_codes": data.load_data("postal_codes", config),
        "stations": data.load_data("ratp", config)
    }

    for flat in flats_list:
        flat_station = flat.get("station", None)
        # TODO: Use flat location field as well?

        if not flat_station:
            # Skip everything if empty station
            LOGGER.info(
                "No station field for flat %s, skipping stations lookup.",
                flat["id"]
            )
            continue

        # Fuzzy-match the free-text station field against the known station
        # names. Low threshold on purpose: results get filtered by distance
        # below.
        matched_stations = fuzzy_match(
            flat_station,
            opendata["stations"].keys(),
            limit=10,
            threshold=50
        )

        # Filter out the stations that are obviously too far and not well
        # guessed
        good_matched_stations = []
        postal_code = flat["flatisfy"].get("postal_code", None)
        if postal_code:
            # If there is a postal code, check that the matched station is
            # closed to it
            postal_code_gps = opendata["postal_codes"][postal_code]["gps"]
            for station in matched_stations:
                # opendata["stations"] is a dict mapping station names to list
                # of coordinates, for efficiency. Note that multiple stations
                # with the same name exist in a city, hence the list of
                # coordinates.
                for station_gps in opendata["stations"][station[0]]:
                    distance = tools.distance(station_gps, postal_code_gps)
                    if distance < distance_threshold:
                        # If at least one of the coordinates for a given
                        # station is close enough, that's ok and we can add
                        # the station
                        good_matched_stations.append({
                            "name": station[0],
                            "confidence": station[1],
                            "gps": station_gps
                        })
                        break
                    # NOTE(review): this logs once per too-far coordinate of
                    # the station, not once per discarded station — confirm
                    # this is the intended verbosity.
                    LOGGER.debug(
                        "Station %s is too far from flat %s, discarding it.",
                        station[0], flat["id"]
                    )
        else:
            LOGGER.info(
                ("No postal code for flat %s, keeping all the matched "
                 "stations with half confidence."),
                flat["id"]
            )
            # Otherwise, we keep every matching station but with half
            # confidence
            good_matched_stations = [
                {
                    "name": station[0],
                    "confidence": station[1] * 0.5,
                    "gps": station_gps
                }
                for station in matched_stations
                for station_gps in opendata["stations"][station[0]]
            ]

        # Store matched stations and the associated confidence
        LOGGER.info(
            "Found stations for flat %s: %s.",
            flat["id"],
            ", ".join(x["name"] for x in good_matched_stations)
        )
        # TODO: Handle update (second pass)
        flat["flatisfy"]["matched_stations"] = good_matched_stations

    return flats_list
301
+
302
+
303
def compute_travel_times(flats_list, config):
    """
    Compute the travel time between each flat and the points listed in the
    constraints.

    :param flats_list: A list of flats dict. Expected to have gone through
    ``guess_stations`` already (reads ``flatisfy.matched_stations``).
    :param config: A config dict.

    :return: An updated list of flats dict with computed travel times.

    .. note :: Requires a Navitia or CityMapper API key in the config.
    """
    for flat in flats_list:
        if not flat["flatisfy"].get("matched_stations", []):
            # Skip any flat without matched stations
            LOGGER.info(
                "Skipping travel time computation for flat %s. No matched "
                "stations.",
                flat["id"]
            )
            continue

        if "time_to" not in flat["flatisfy"]:
            # Ensure time_to key is initialized
            flat["flatisfy"]["time_to"] = {}

        # For each place, loop over the stations close to the flat, and find
        # the minimum travel time.
        for place_name, place in config["constraints"]["time_to"].items():
            time_to_place = None
            for station in flat["flatisfy"]["matched_stations"]:
                time_from_station = tools.get_travel_time_between(
                    station["gps"],
                    place["gps"],
                    config
                )
                # The ``is None`` test must come first: comparing an int
                # with ``None`` using ``<`` raises a TypeError on Python 3.
                if time_from_station and (time_to_place is None or
                                          time_from_station < time_to_place):
                    time_to_place = time_from_station

            if time_to_place:
                LOGGER.info(
                    "Travel time between %s and flat %s is %ds.",
                    place_name, flat["id"], time_to_place
                )
                flat["flatisfy"]["time_to"][place_name] = time_to_place
    return flats_list

+ 0
- 0
flatisfy/models/__init__.py View File


+ 101
- 0
flatisfy/models/flat.py View File

@@ -0,0 +1,101 @@
1
+# coding: utf-8
2
+"""
3
+This modules defines an SQLAlchemy ORM model for a flat.
4
+"""
5
+# pylint: disable=invalid-name,too-few-public-methods
6
+from __future__ import absolute_import, print_function, unicode_literals
7
+
8
+import enum
9
+
10
+from sqlalchemy import Column, DateTime, Enum, Float, String, Text
11
+
12
+from flatisfy.database.base import BASE
13
+from flatisfy.database.types import MagicJSON
14
+
15
+
16
class FlatStatus(enum.Enum):
    """
    An enum of the possible status for a flat entry.

    Numeric values are ordered along the workflow: a negative value marks a
    discarded entry, ``new`` (0) is the default (see ``Flat.status``) and
    positive values track contact progress.
    """
    purged = -10     # Filtered out by the filtering passes
    new = 0          # Freshly fetched entry, default status
    contacted = 10   # The poster has been contacted
    answer_no = 20   # Negative answer received
    answer_yes = 21  # Positive answer received
+
26
+
27
class Flat(BASE):
    """
    SQLAlchemy ORM model to store a flat.

    Columns mirror the fields of the flat dicts produced by the fetch and
    filtering passes, plus the flatisfy-computed metadata (flattened into
    ``flatisfy_*`` columns) and a workflow ``status``.
    """
    __tablename__ = "flats"

    # Weboob data
    id = Column(String, primary_key=True)
    area = Column(Float)
    bedrooms = Column(Float)
    cost = Column(Float)
    currency = Column(String)
    date = Column(DateTime)
    details = Column(MagicJSON)
    location = Column(String)
    phone = Column(String)
    photos = Column(MagicJSON)
    rooms = Column(Float)
    station = Column(String)
    text = Column(Text)
    title = Column(String)
    url = Column(String)

    # Flatisfy data
    # TODO: Should be in another table with relationships
    flatisfy_stations = Column(MagicJSON)
    flatisfy_postal_code = Column(String)
    flatisfy_time_to = Column(MagicJSON)

    # Status
    status = Column(Enum(FlatStatus), default=FlatStatus.new)

    @staticmethod
    def from_dict(flat_dict):
        """
        Create a Flat object from a flat dict as manipulated by the filtering
        pass.

        :param flat_dict: A flat dict. Must carry a ``flatisfy`` key holding
        the metadata built by the filtering passes (raises ``KeyError``
        otherwise). The caller's dict is not modified (a shallow copy is
        made).
        :return: A new ``Flat`` instance.
        """
        # Handle flatisfy metadata: flatten the nested "flatisfy" dict into
        # the dedicated flatisfy_* columns.
        flat_dict = flat_dict.copy()
        flat_dict["flatisfy_stations"] = (
            flat_dict["flatisfy"].get("matched_stations", None)
        )
        flat_dict["flatisfy_postal_code"] = (
            flat_dict["flatisfy"].get("postal_code", None)
        )
        flat_dict["flatisfy_time_to"] = (
            flat_dict["flatisfy"].get("time_to", None)
        )
        del flat_dict["flatisfy"]

        # Handle date field
        flat_dict["date"] = None  # TODO

        # NOTE(review): updating ``__dict__`` directly bypasses SQLAlchemy's
        # attribute instrumentation; setting attributes one by one with
        # ``setattr`` would be safer — confirm this round-trips through the
        # session as expected.
        flat_object = Flat()
        flat_object.__dict__.update(flat_dict)
        return flat_object

    def __repr__(self):
        return "<Flat(id=%s, url=%s)>" % (self.id, self.url)


    def json_api_repr(self):
        """
        Return a dict representation of this flat object that is JSON
        serializable.
        """
        # Drop SQLAlchemy internals (e.g. _sa_instance_state) by skipping
        # underscore-prefixed attributes.
        flat_repr = {
            k: v
            for k, v in self.__dict__.items()
            if not k.startswith("_")
        }
        # Enum values are not JSON serializable: stringify the status.
        flat_repr["status"] = str(flat_repr["status"])

        return flat_repr

+ 239
- 0
flatisfy/tools.py View File

@@ -0,0 +1,239 @@
1
+# coding: utf-8
2
+"""
3
+This module contains basic utility functions, such as pretty printing of JSON
4
+output, checking that a value is within a given interval etc.
5
+"""
6
+from __future__ import (
7
+    absolute_import, division, print_function, unicode_literals
8
+)
9
+
10
+import datetime
11
+import json
12
+import logging
13
+import math
14
+import re
15
+
16
+import requests
17
+import unidecode
18
+
19
+
20
+LOGGER = logging.getLogger(__name__)
21
+
22
+
23
+def pretty_json(data):
24
+    """
25
+    Pretty JSON output.
26
+
27
+    :param data: The data to dump as pretty JSON.
28
+    :return: The pretty printed JSON dump.
29
+
30
+    :Example:
31
+
32
+        >>> print(pretty_json({"toto": "ok", "foo": "bar"}))
33
+        {
34
+            "foo": "bar",
35
+            "toto": "ok"
36
+        }
37
+    """
38
+    return json.dumps(data, indent=4, separators=(',', ': '),
39
+                      sort_keys=True)
40
+
41
+
42
+def is_within_interval(value, min_value=None, max_value=None):
43
+    """
44
+    Check whether a variable is within a given interval. Assumes the value is
45
+    always ok with respect to a `None` bound. If the `value` is `None`, it is
46
+    always within the bounds.
47
+
48
+    :param value: The value to check. Can be ``None``.
49
+    :param min_value: The lower bound.
50
+    :param max_value: The upper bound.
51
+    :return: ``True`` if the value is ``None``. ``True`` or ``False`` whether
52
+    the value is within the given interval or not.
53
+
54
+    .. note:: A value is always within a ``None`` bound.
55
+
56
+    :Example:
57
+
58
+        >>> is_within_interval(None)
59
+        True
60
+        >>> is_within_interval(None, 0, 10)
61
+        True
62
+        >>> is_within_interval(2, None, None)
63
+        True
64
+        >>> is_within_interval(2, None, 3)
65
+        True
66
+        >>> is_within_interval(2, 1, None)
67
+        True
68
+        >>> is_within_interval(2, 1, 3)
69
+        True
70
+        >>> is_within_interval(2, 4, 7)
71
+        False
72
+        >>> is_within_interval(2, 4, 1)
73
+        False
74
+    """
75
+    checks = []
76
+    if value and min_value:
77
+        checks.append(value >= min_value)
78
+    if value and max_value:
79
+        checks.append(value <= max_value)
80
+    return all(checks)
81
+
82
+
83
+def normalize_string(string):
84
+    """
85
+    Normalize the given string for matching.
86
+
87
+    .. todo :: Convert romanian numerals to decimal
88
+
89
+    :Example:
90
+
91
+        >>> normalize_string("tétéà 14ème-XIV,  foobar")
92
+        'tetea 14eme xiv, foobar'
93
+    """
94
+    # ASCIIfy the string
95
+    string = unidecode.unidecode(string)
96
+
97
+    # Replace any non-alphanumeric character by space
98
+    # Keep some basic punctuation to keep syntaxic units
99
+    string = re.sub(r"[^a-zA-Z0-9,;:]", " ", string)
100
+
101
+    # Convert to lowercase
102
+    string = string.lower()
103
+
104
+    # Collapse multiple spaces, replace tabulations and newlines by space
105
+    string = re.sub(r"\s+", " ", string)
106
+
107
+    return string
108
+
109
+
110
+def uniqify(some_list):
111
+    """
112
+    Filter out duplicates from a given list.
113
+
114
+    :Example:
115
+
116
+        >>> uniqify([1, 2, 2, 3])
117
+        [1, 2, 3]
118
+    """
119
+    return list(set(some_list))
120
+
121
+
122
+def distance(gps1, gps2):
123
+    """
124
+    Compute the distance between two tuples of latitude and longitude.
125
+
126
+    :param gps1: First tuple of (latitude, longitude).
127
+    :param gps2: Second tuple of (latitude, longitude).
128
+    :return: The distance in meters.
129
+
130
+    :Example:
131
+
132
+        >>> int(distance([48.86786647303717, 2.19368117495212], \
133
+                         [48.95314107920405, 2.3368043817358464]))
134
+        14117
135
+    """
136
+    lat1 = math.radians(gps1[0])
137
+    long1 = math.radians(gps1[1])
138
+
139
+    lat2 = math.radians(gps2[0])
140
+    long2 = math.radians(gps2[1])
141
+
142
+    # pylint: disable=invalid-name
143
+    a = (
144
+        math.sin((lat2 - lat1) / 2.0)**2 +
145
+        math.cos(lat1) * math.cos(lat2) * math.sin((long2 - long1) / 2.0)**2
146
+    )
147
+    c = 2.0 * math.atan2(math.sqrt(a), math.sqrt(1 - a))
148
+    earth_radius = 6371000
149
+
150
+    return earth_radius * c
151
+
152
+
153
+def sort_list_of_dicts_by(flats_list, key):
154
+    """
155
+    Sort a list of dicts according to a given field common to all the dicts.
156
+
157
+    :param flats_list: List of dicts to sort.
158
+    :param key: The key of the dict items to sort on.
159
+    :return: A sorted list.
160
+
161
+    :Example:
162
+
163
+        >>> sort_list_of_dicts_by([{1: 2}, {1: 1}], 1)
164
+        [{1: 1}, {1: 2}]
165
+    """
166
+    return sorted(flats_list, key=lambda x: x[key])
167
+
168
+
169
+def merge_dicts(*args):
170
+    """
171
+    Merge the two flats passed as argument in a single flat dict object.
172
+    """
173
+    if len(args) == 1:
174
+        return args[0]
175
+    else:
176
+        flat1, flat2 = args[:2]
177
+        merged_flat = {}
178
+        for k, value2 in flat2.items():
179
+            value1 = flat1.get(k, None)
180
+            if value1 is None:
181
+                # flat1 has empty matching field, just keep the flat2 field
182
+                merged_flat[k] = value2
183
+            elif value2 is None:
184
+                # flat2 field is empty, just keep the flat1 field
185
+                merged_flat[k] = value1
186
+            else:
187
+                # Any other case, we should merge
188
+                # TODO: Do the merge
189
+                merged_flat[k] = value1
190
+        return merge_dicts(merged_flat, *args[2:])
191
+
192
+
193
+def get_travel_time_between(latlng_from, latlng_to, config):
194
+    """
195
+    Query the Navitia API to get the travel time between two points identified
196
+    by their latitude and longitude.
197
+
198
+    :param latlng_from: A tuple of (latitude, longitude) for the starting
199
+    point.
200
+    :param latlng_to: A tuple of (latitude, longitude) for the destination.
201
+    :return: The travel time in seconds. Returns ``None`` if it could not fetch
202
+    it.
203
+
204
+    .. note :: Uses the Navitia API. Requires a ``navitia_api_key`` field to be
205
+    filled-in in the ``config``.
206
+    """
207
+    NAVITIA_ENDPOINT = "https://api.navitia.io/v1/coverage/fr-idf/journeys"
208
+    time = None
209
+
210
+    # Check that Navitia API key is available
211
+    if config["navitia_api_key"]:
212
+        payload = {
213
+            "from": "%s;%s" % (latlng_from[1], latlng_from[0]),
214
+            "to": "%s;%s" % (latlng_to[1], latlng_to[0]),
215
+            "datetime": datetime.datetime.now().isoformat(),
216
+            "count": 1
217
+        }
218
+        try:
219
+            # Do the query to Navitia API
220
+            req = requests.get(
221
+                NAVITIA_ENDPOINT, params=payload,
222
+                auth=(config["navitia_api_key"], "")
223
+            )
224
+            req.raise_for_status()
225
+            time = req.json()["journeys"][0]["durations"]["total"]
226
+        except (requests.exceptions.RequestException,
227
+                ValueError, IndexError, KeyError) as e:
228
+            # Ignore any possible exception
229
+            LOGGER.warning(
230
+                "An exception occurred during travel time lookup on "
231
+                "Navitia: %s.",
232
+                str(e)
233
+            )
234
+    else:
235
+        LOGGER.warning(
236
+            "No API key available for travel time lookup. Please provide "
237
+            "a Navitia API key. Skipping travel time lookup."
238
+        )
239
+    return time

+ 0
- 0
flatisfy/web/__init__.py View File


+ 53
- 0
flatisfy/web/app.py View File

@@ -0,0 +1,53 @@
1
+# coding: utf-8
2
+"""
3
+This module contains the definition of the Bottle web app.
4
+"""
5
+from __future__ import (
6
+    absolute_import, division, print_function, unicode_literals
7
+)
8
+
9
+import os
10
+
11
+import bottle
12
+
13
+from flatisfy import database
14
+from flatisfy.web.routes import api as api_routes
15
+from flatisfy.web.dbplugin import DatabasePlugin
16
+
17
+
18
def _serve_static_file(filename):
    """
    Helper function to serve static file.
    """
    # Static assets live in the "static" directory next to this module.
    static_root = os.path.join(
        os.path.dirname(os.path.realpath(__file__)),
        "static"
    )
    return bottle.static_file(filename, root=static_root)
30
+
31
def get_app(config):
    """
    Get a Bottle app instance with all the routes set-up.

    :param config: A config dict (reads the ``database`` entry to build the
    database connection).
    :return: The built bottle app.
    """
    # Build the session factory; the DatabasePlugin injects a session into
    # every route callback declaring a ``db`` argument.
    get_session = database.init_db(config["database"])

    app = bottle.default_app()
    app.install(DatabasePlugin(get_session))

    # API v1 routes
    app.route("/api/v1/", "GET", api_routes.index_v1)
    app.route("/api/v1/flats", "GET", api_routes.flats_v1)
    app.route("/api/v1/flat/:id", "GET", api_routes.flat_v1)

    # Index
    app.route("/", "GET", lambda: _serve_static_file("index.html"))

    # Static files
    app.route("/static/<filename:path>", "GET", _serve_static_file)

    return app

+ 58
- 0
flatisfy/web/dbplugin.py View File

@@ -0,0 +1,58 @@
1
+# coding: utf-8
2
+"""
3
+This module contains a Bottle plugin to pass the database argument to any route
4
+which needs it.
5
+"""
6
+from __future__ import (
7
+    absolute_import, division, print_function, unicode_literals
8
+)
9
+
10
+import functools
11
+import inspect
12
+
13
+import bottle
14
+
15
+
16
class DatabasePlugin(object):
    """
    Bottle plugin injecting a database session as the ``db`` keyword argument
    of any route callback that declares it.
    """
    name = 'database'
    api = 2
    KEYWORD = "db"

    def __init__(self, get_session):
        """
        :param get_session: A context manager factory (e.g. built around an
        SQLAlchemy ``sessionmaker``) yielding a database session.
        """
        self.get_session = get_session

    def setup(self, app):
        """
        Make sure that other installed plugins don't affect the same
        keyword argument and check if metadata is available.
        """
        for other in app.plugins:
            if not isinstance(other, DatabasePlugin):
                continue
            else:
                raise bottle.PluginError(
                    "Found another conflicting Database plugin."
                )

    def apply(self, callback, route):
        """
        Wrap the route callback so that a fresh database session is opened
        for every request and stays open while the callback runs.

        :param callback: The route callback to wrap.
        :param route: The bottle route being installed.
        :return: The callback itself if it does not ask for a session, or a
        wrapping function injecting one otherwise.
        """
        try:
            callback_args = inspect.signature(route.callback).parameters
        except AttributeError:
            # inspect.signature does not exist on older Python
            callback_args = inspect.getargspec(route.callback).args

        if self.KEYWORD not in callback_args:
            # The callback does not expect a session; leave it untouched.
            return callback

        @functools.wraps(callback)
        def wrapper(*args, **kwargs):
            """Open a session for the duration of a single request."""
            # Bug fix: the previous implementation opened the session once
            # at route-setup time and exited the context manager before the
            # callback was ever invoked, so the callback received a
            # closed/stale session. Opening it per-call keeps it alive
            # while the callback runs.
            with self.get_session() as session:
                kwargs[self.KEYWORD] = session
                return callback(*args, **kwargs)
        return wrapper


Plugin = DatabasePlugin

+ 0
- 0
flatisfy/web/routes/__init__.py View File


+ 47
- 0
flatisfy/web/routes/api.py View File

@@ -0,0 +1,47 @@
1
+# coding: utf-8
2
+"""
3
+This module contains the definition of the web app API routes.
4
+"""
5
+from __future__ import (
6
+    absolute_import, division, print_function, unicode_literals
7
+)
8
+
9
+from flatisfy.models import flat as flat_model
10
+
11
+
12
def index_v1():
    """
    API v1 index route:

        GET /api/v1/

    :return: A dict listing the available API endpoints.
    """
    endpoints = {"flats": "/api/v1/flats"}
    return endpoints
+
22
+
23
+def flats_v1(db):
24
+    """
25
+    API v1 flats route:
26
+
27
+        GET /api/v1/flats
28
+    """
29
+    flats = [
30
+        flat.json_api_repr()
31
+        for flat in db.query(flat_model.Flat).all()
32
+    ]
33
+    return {
34
+        "data": flats
35
+    }
36
+
37
+
38
+def flat_v1(id, db):
39
+    """
40
+    API v1 flat route:
41
+
42
+        GET /api/v1/flat/:id
43
+    """
44
+    flat = db.query(flat_model.Flat).filter_by(id=id).first()
45
+    return {
46
+        "data": flat.json_api_repr()
47
+    }

+ 30
- 0
flatisfy/web/static/index.html View File

@@ -0,0 +1,30 @@
1
+<!doctype html>
2
+<html lang="fr">
3
+    <head>
4
+        <meta charset="utf-8">
5
+        <title>Flatisfy</title>
6
+        <script src="https://unpkg.com/vue"></script>
7
+    </head>
8
+    <body>
9
+        <div id="app">
10
+            <h1>Flatisfy</h1>
11
+            <table>
12
+                <thead>