Merge branch 'bagage/Flatisfy-master'

Phyks (Lucas Verney) 8 months ago
commit 1a95495c30
59 changed files with 873794 additions and 1810 deletions
  1. .dockerignore (+1, -0)
  2. .editorconfig (+10, -0)
  3. .eslintrc (+3, -0)
  4. .gitignore (+0, -2)
  5. .vscode/extensions.json (+6, -0)
  6. .vscode/settings.json (+15, -0)
  7. doc/0.getting_started.md (+7, -1)
  8. doc/conf.py (+47, -41)
  9. docker/Dockerfile (+1, -1)
  10. docker/docker-compose.yml (+1, -0)
  11. flatisfy/__main__.py (+60, -88)
  12. flatisfy/cmds.py (+52, -48)
  13. flatisfy/config.py (+50, -57)
  14. flatisfy/constants.py (+1, -1)
  15. flatisfy/data.py (+5, -11)
  16. flatisfy/data_files/__init__.py (+74, -57)
  17. flatisfy/data_files/laposte.json (+863916, -1)
  18. flatisfy/database/__init__.py (+1, -3)
  19. flatisfy/database/types.py (+1, -1)
  20. flatisfy/database/whooshalchemy.py (+11, -22)
  21. flatisfy/email.py (+65, -39)
  22. flatisfy/exceptions.py (+1, -0)
  23. flatisfy/fetch.py (+45, -69)
  24. flatisfy/filters/__init__.py (+71, -99)
  25. flatisfy/filters/cache.py (+15, -6)
  26. flatisfy/filters/duplicates.py (+34, -66)
  27. flatisfy/filters/images.py (+2, -9)
  28. flatisfy/filters/metadata.py (+145, -147)
  29. flatisfy/models/flat.py (+31, -34)
  30. flatisfy/models/postal_code.py (+4, -8)
  31. flatisfy/models/public_transport.py (+2, -3)
  32. flatisfy/tests.py (+112, -166)
  33. flatisfy/tools.py (+73, -85)
  34. flatisfy/web/app.py (+33, -47)
  35. flatisfy/web/configplugin.py (+4, -7)
  36. flatisfy/web/dbplugin.py (+6, -7)
  37. flatisfy/web/js_src/components/app.vue (+1, -1)
  38. flatisfy/web/js_src/components/flat.vue (+398, -0)
  39. flatisfy/web/js_src/components/flatsmap.vue (+41, -31)
  40. flatisfy/web/js_src/components/flatstableline.vue (+9, -12)
  41. flatisfy/web/js_src/components/notation.vue (+68, -0)
  42. flatisfy/web/js_src/main.js (+3, -0)
  43. flatisfy/web/js_src/store/getters.js (+61, -46)
  44. flatisfy/web/js_src/tools/index.js (+19, -2)
  45. flatisfy/web/js_src/views/details.vue (+4, -405)
  46. flatisfy/web/js_src/views/home.vue (+59, -36)
  47. flatisfy/web/js_src/views/search.vue (+6, -6)
  48. flatisfy/web/js_src/views/status.vue (+6, -6)
  49. flatisfy/web/routes/api.py (+93, -109)
  50. import.sh (+2, -0)
  51. migrations/env.py (+6, -8)
  52. migrations/versions/8155b83242eb_add_is_expired.py (+3, -9)
  53. migrations/versions/9e58c66f1ac1_add_flat_insee_column.py (+24, -0)
  54. migrations/versions/d21933db9ad8_add_flat_position_column.py (+69, -0)
  55. package.json (+7, -3)
  56. requirements.txt (+4, -2)
  57. start.sh (+18, -0)
  58. wsgi.py (+3, -8)
  59. yarn.lock (+7985, -0)

.dockerignore (+1, -0)

@@ -0,0 +1 @@
+data

.editorconfig (+10, -0)

@@ -0,0 +1,10 @@
+root = true
+
+[*]
+indent_style = space
+indent_size = 4
+end_of_line = lf
+insert_final_newline = true
+
+[*.py]
+max_line_length=120

.eslintrc (+3, -0)

@@ -4,6 +4,9 @@
     "env": {
         "browser": true
     },
+    "parserOptions": {
+        "ecmaVersion": 8
+    },
     rules: {
         'indent': ["error", 4, { 'SwitchCase': 1 }],
     }

.gitignore (+0, -2)

@@ -9,9 +9,7 @@ flatisfy/web/static/assets
 data/
 package-lock.json
 doc/_build
-yarn.lock
 data_rework/
 .env
 weboob
 .htpasswd
-.vscode

.vscode/extensions.json (+6, -0)

@@ -0,0 +1,6 @@
+{
+    "recommendations": [
+        "mtxr.sqltools",
+        "mtxr.sqltools-driver-sqlite"
+    ]
+}

.vscode/settings.json (+15, -0)

@@ -0,0 +1,15 @@
+{
+    "cSpell.words": [
+        "Weboob",
+        "flatisfy"
+    ],
+    "sqltools.useNodeRuntime": true,
+    "sqltools.connections": [
+        {
+            "previewLimit": 50,
+            "driver": "SQLite",
+            "name": "flatisfy",
+            "database": "${workspaceFolder:flatisfy}/data/flatisfy.db"
+        }
+    ]
+}

doc/0.getting_started.md (+7, -1)

@@ -227,4 +227,10 @@ schema might change from time to time. Here is how to update it automatically:
 
 ### Other tools more or less connected with Flatisfy
 
-+ [ZipAround](https://github.com/guix77/ziparound) generates a list of ZIP codes centered on a city name, within a radius of N kilometers and within a certain travel time by car (France only)
++ [ZipAround](https://github.com/guix77/ziparound) generates a list of ZIP codes centered on a city name, within a radius of N kilometers and within a certain travel time by car (France only). You can invoke it with:
+
+```sh
+yarn ziparound
+# or alternatively
+yarn ziparound --code 75001 --distance 3
+```

doc/conf.py (+47, -41)

@@ -18,7 +18,8 @@
 
 import os
 import sys
-sys.path.insert(0, os.path.abspath('..'))
+
+sys.path.insert(0, os.path.abspath(".."))
 
 # -- General configuration ------------------------------------------------
 
@@ -30,19 +31,19 @@ sys.path.insert(0, os.path.abspath('..'))
 # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
 # ones.
 extensions = [
-    'sphinx.ext.autodoc',
-    'sphinx.ext.viewcode',
+    "sphinx.ext.autodoc",
+    "sphinx.ext.viewcode",
 ]
 
 # Add any paths that contain templates here, relative to this directory.
-templates_path = ['_templates']
+templates_path = ["_templates"]
 
 # The suffix(es) of source filenames.
 # You can specify multiple suffix as a list of string:
 #
-source_suffix = ['.rst', '.md']
+source_suffix = [".rst", ".md"]
 source_parsers = {
-   '.md': 'recommonmark.parser.CommonMarkParser',
+    ".md": "recommonmark.parser.CommonMarkParser",
 }
 
 # The encoding of source files.
@@ -50,21 +51,21 @@ source_parsers = {
 # source_encoding = 'utf-8-sig'
 
 # The master toctree document.
-master_doc = 'index'
+master_doc = "index"
 
 # General information about the project.
-project = u'Flatisfy'
-copyright = u'2017, Phyks (Lucas Verney)'
-author = u'Phyks (Lucas Verney)'
+project = u"Flatisfy"
+copyright = u"2017, Phyks (Lucas Verney)"
+author = u"Phyks (Lucas Verney)"
 
 # The version info for the project you're documenting, acts as replacement for
 # |version| and |release|, also used in various other places throughout the
 # built documents.
 #
 # The short X.Y version.
-version = u'0.1'
+version = u"0.1"
 # The full version, including alpha/beta/rc tags.
-release = u'0.1'
+release = u"0.1"
 
 # The language for content autogenerated by Sphinx. Refer to documentation
 # for a list of supported languages.
@@ -85,7 +86,7 @@ language = None
 # List of patterns, relative to source directory, that match files and
 # directories to ignore when looking for source files.
 # This patterns also effect to html_static_path and html_extra_path
-exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']
+exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"]
 
 # The reST default role (used for this markup: `text`) to use for all
 # documents.
@@ -107,7 +108,7 @@ exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']
 # show_authors = False
 
 # The name of the Pygments (syntax highlighting) style to use.
-pygments_style = 'sphinx'
+pygments_style = "sphinx"
 
 # A list of ignored prefixes for module index sorting.
 # modindex_common_prefix = []
@@ -124,7 +125,7 @@ todo_include_todos = False
 # The theme to use for HTML and HTML Help pages.  See the documentation for
 # a list of builtin themes.
 #
-html_theme = 'classic'
+html_theme = "classic"
 
 # Theme options are theme-specific and customize the look and feel of a theme
 # further.  For a list of options available for each theme, see the
@@ -158,7 +159,7 @@ html_theme = 'classic'
 # Add any paths that contain custom static files (such as style sheets) here,
 # relative to this directory. They are copied after the builtin static files,
 # so a file named "default.css" will overwrite the builtin "default.css".
-html_static_path = ['_static']
+html_static_path = ["_static"]
 
 # Add any extra paths that contain custom files (such as robots.txt or
 # .htaccess) here, relative to this directory. These files are copied
@@ -238,34 +239,36 @@ html_static_path = ['_static']
 # html_search_scorer = 'scorer.js'
 
 # Output file base name for HTML help builder.
-htmlhelp_basename = 'Flatisfydoc'
+htmlhelp_basename = "Flatisfydoc"
 
 # -- Options for LaTeX output ---------------------------------------------
 
 latex_elements = {
-     # The paper size ('letterpaper' or 'a4paper').
-     #
-     # 'papersize': 'letterpaper',
-
-     # The font size ('10pt', '11pt' or '12pt').
-     #
-     # 'pointsize': '10pt',
-
-     # Additional stuff for the LaTeX preamble.
-     #
-     # 'preamble': '',
-
-     # Latex figure (float) alignment
-     #
-     # 'figure_align': 'htbp',
+    # The paper size ('letterpaper' or 'a4paper').
+    #
+    # 'papersize': 'letterpaper',
+    # The font size ('10pt', '11pt' or '12pt').
+    #
+    # 'pointsize': '10pt',
+    # Additional stuff for the LaTeX preamble.
+    #
+    # 'preamble': '',
+    # Latex figure (float) alignment
+    #
+    # 'figure_align': 'htbp',
 }
 
 # Grouping the document tree into LaTeX files. List of tuples
 # (source start file, target name, title,
 #  author, documentclass [howto, manual, or own class]).
 latex_documents = [
-    (master_doc, 'Flatisfy.tex', u'Flatisfy Documentation',
-     u'Phyks (Lucas Verney)', 'manual'),
+    (
+        master_doc,
+        "Flatisfy.tex",
+        u"Flatisfy Documentation",
+        u"Phyks (Lucas Verney)",
+        "manual",
+    ),
 ]
 
 # The name of an image file (relative to this directory) to place at the top of
@@ -305,10 +308,7 @@ latex_documents = [
 
 # One entry per manual page. List of tuples
 # (source start file, name, description, authors, manual section).
-man_pages = [
-    (master_doc, 'flatisfy', u'Flatisfy Documentation',
-     [author], 1)
-]
+man_pages = [(master_doc, "flatisfy", u"Flatisfy Documentation", [author], 1)]
 
 # If true, show URL addresses after external links.
 #
@@ -321,9 +321,15 @@ man_pages = [
 # (source start file, target name, title, author,
 #  dir menu entry, description, category)
 texinfo_documents = [
-    (master_doc, 'Flatisfy', u'Flatisfy Documentation',
-     author, 'Flatisfy', 'One line description of project.',
-     'Miscellaneous'),
+    (
+        master_doc,
+        "Flatisfy",
+        u"Flatisfy Documentation",
+        author,
+        "Flatisfy",
+        "One line description of project.",
+        "Miscellaneous",
+    ),
 ]
 
 # Documents to append as an appendix to all manuals.

docker/Dockerfile (+1, -1)

@@ -19,7 +19,7 @@ RUN curl -sL https://deb.nodesource.com/setup_10.x | bash - \
     && apt-get install -y nodejs
 
 # Install weboob's code itself.
-RUN git clone --depth 1 https://git.weboob.org/weboob/devel /home/user/weboob \
+RUN git clone --depth 1 https://git.weboob.org/weboob/weboob /home/user/weboob \
     && cd /home/user/weboob \
     && pip install .
 

docker/docker-compose.yml (+1, -0)

@@ -9,3 +9,4 @@ services:
       - ./data:/flatisfy
     ports:
       - "8080:8080"
+    working_dir: /home/user/app

flatisfy/__main__.py (+60, -88)

@@ -17,6 +17,7 @@ from flatisfy import data
 from flatisfy import fetch
 from flatisfy import tools
 from flatisfy import tests
+
 # pylint: enable=locally-disabled,wrong-import-position
 
 
@@ -27,68 +28,47 @@ def parse_args(argv=None):
     """
     Create parser and parse arguments.
     """
-    parser = argparse.ArgumentParser(prog="Flatisfy",
-                                     description="Find the perfect flat.")
+    parser = argparse.ArgumentParser(prog="Flatisfy", description="Find the perfect flat.")
 
     # Parent parser containing arguments common to any subcommand
     parent_parser = argparse.ArgumentParser(add_help=False)
+    parent_parser.add_argument("--data-dir", help="Location of Flatisfy data directory.")
+    parent_parser.add_argument("--config", help="Configuration file to use.")
     parent_parser.add_argument(
-        "--data-dir",
-        help="Location of Flatisfy data directory."
-    )
-    parent_parser.add_argument(
-        "--config",
-        help="Configuration file to use."
-    )
-    parent_parser.add_argument(
-        "--passes", choices=[0, 1, 2, 3], type=int,
-        help="Number of passes to do on the filtered data."
-    )
-    parent_parser.add_argument(
-        "--max-entries", type=int,
-        help="Maximum number of entries to fetch."
-    )
-    parent_parser.add_argument(
-        "-v", "--verbose", action="store_true",
-        help="Verbose logging output."
-    )
-    parent_parser.add_argument(
-        "-vv", action="store_true",
-        help="Debug logging output."
+        "--passes",
+        choices=[0, 1, 2, 3],
+        type=int,
+        help="Number of passes to do on the filtered data.",
     )
+    parent_parser.add_argument("--max-entries", type=int, help="Maximum number of entries to fetch.")
+    parent_parser.add_argument("-v", "--verbose", action="store_true", help="Verbose logging output.")
+    parent_parser.add_argument("-vv", action="store_true", help="Debug logging output.")
     parent_parser.add_argument(
-        "--constraints", type=str,
-        help="Comma-separated list of constraints to consider."
+        "--constraints",
+        type=str,
+        help="Comma-separated list of constraints to consider.",
    )
 
     # Subcommands
-    subparsers = parser.add_subparsers(
-        dest="cmd", help="Available subcommands"
-    )
+    subparsers = parser.add_subparsers(dest="cmd", help="Available subcommands")
 
     # Build data subcommand
-    subparsers.add_parser(
-        "build-data", parents=[parent_parser],
-        help="Build necessary data"
-    )
+    subparsers.add_parser("build-data", parents=[parent_parser], help="Build necessary data")
 
     # Init config subcommand
     parser_init_config = subparsers.add_parser(
-        "init-config", parents=[parent_parser],
-        help="Initialize empty configuration."
-    )
-    parser_init_config.add_argument(
-        "output", nargs="?", help="Output config file. Use '-' for stdout."
+        "init-config", parents=[parent_parser], help="Initialize empty configuration."
     )
+    parser_init_config.add_argument("output", nargs="?", help="Output config file. Use '-' for stdout.")
 
     # Fetch subcommand parser
-    subparsers.add_parser("fetch", parents=[parent_parser],
-                          help="Fetch housings posts")
+    subparsers.add_parser("fetch", parents=[parent_parser], help="Fetch housings posts")
 
     # Filter subcommand parser
     parser_filter = subparsers.add_parser(
-        "filter", parents=[parent_parser],
-        help="Filter housings posts according to constraints in config."
+        "filter",
+        parents=[parent_parser],
+        help="Filter housings posts according to constraints in config.",
    )
     parser_filter.add_argument(
         "--input",
@@ -97,28 +77,29 @@ def parse_args(argv=None):
             "no additional fetching of infos is done, and the script outputs "
             "a filtered JSON dump on stdout. If not provided, update status "
             "of the flats in the database."
-        )
+        ),
     )
 
     # Import subcommand parser
-    subparsers.add_parser("import", parents=[parent_parser],
-                          help="Import housing posts in database.")
+    import_filter = subparsers.add_parser("import", parents=[parent_parser], help="Import housing posts in database.")
+    import_filter.add_argument(
+        "--new-only",
+        action="store_true",
+        help=("Download new housing posts only but do not refresh existing ones"),
+    )
 
     # Purge subcommand parser
-    subparsers.add_parser("purge", parents=[parent_parser],
-                          help="Purge database.")
+    subparsers.add_parser("purge", parents=[parent_parser], help="Purge database.")
 
     # Serve subcommand parser
-    parser_serve = subparsers.add_parser("serve", parents=[parent_parser],
-                                         help="Serve the web app.")
+    parser_serve = subparsers.add_parser("serve", parents=[parent_parser], help="Serve the web app.")
     parser_serve.add_argument("--port", type=int, help="Port to bind to.")
     parser_serve.add_argument("--host", help="Host to listen on.")
 
     # Test subcommand parser
-    subparsers.add_parser("test", parents=[parent_parser],
-                          help="Unit testing.")
+    subparsers.add_parser("test", parents=[parent_parser], help="Unit testing.")
 
-    return parser.parse_args(argv)
+    return parser, parser.parse_args(argv)
 
 
 def main():
@@ -127,25 +108,30 @@ def main():
     """
     # pylint: disable=locally-disabled,too-many-branches
     # Parse arguments
-    args = parse_args()
+    parser, args = parse_args()
 
     # Set logger
-    if args.vv:
-        logging.getLogger('').setLevel(logging.DEBUG)
-        logging.getLogger('sqlalchemy.engine').setLevel(logging.DEBUG)
-    elif args.verbose:
-        logging.getLogger('').setLevel(logging.INFO)
+    if getattr(args, 'vv', False):
+        logging.getLogger("").setLevel(logging.DEBUG)
+        logging.getLogger("titlecase").setLevel(logging.INFO)
+        logging.getLogger("sqlalchemy.engine").setLevel(logging.INFO)
+    elif getattr(args, 'verbose', False):
+        logging.getLogger("").setLevel(logging.INFO)
         # sqlalchemy INFO level is way too loud, just stick with WARNING
-        logging.getLogger('sqlalchemy.engine').setLevel(logging.WARNING)
+        logging.getLogger("sqlalchemy.engine").setLevel(logging.WARNING)
     else:
-        logging.getLogger('').setLevel(logging.WARNING)
-        logging.getLogger('sqlalchemy.engine').setLevel(logging.WARNING)
+        logging.getLogger("").setLevel(logging.WARNING)
+        logging.getLogger("sqlalchemy.engine").setLevel(logging.WARNING)
 
     # Init-config command
     if args.cmd == "init-config":
         flatisfy.config.init_config(args.output)
         sys.exit(0)
     else:
+        if not args.cmd:
+            parser.print_help()
+            sys.exit(0)
+
         # Load config
         if args.cmd == "build-data":
             # Data not yet built, do not use it in config checks
@@ -153,9 +139,11 @@ def main():
         else:
             config = flatisfy.config.load_config(args, check_with_data=True)
         if config is None:
-            LOGGER.error("Invalid configuration. Exiting. "
-                         "Run init-config before if this is the first time "
-                         "you run Flatisfy.")
+            LOGGER.error(
+                "Invalid configuration. Exiting. "
+                "Run init-config before if this is the first time "
+                "you run Flatisfy."
+            )
             sys.exit(1)
 
     # Purge command
@@ -171,18 +159,11 @@ def main():
     if args.cmd == "fetch":
         # Fetch and filter flats list
         fetched_flats = fetch.fetch_flats(config)
-        fetched_flats = cmds.filter_fetched_flats(config,
-                                                  fetched_flats=fetched_flats,
-                                                  fetch_details=True)
+        fetched_flats = cmds.filter_fetched_flats(config, fetched_flats=fetched_flats, fetch_details=True)
         # Sort by cost
-        fetched_flats = {
-            k: tools.sort_list_of_dicts_by(v["new"], "cost")
-            for k, v in fetched_flats.items()
-        }
-
-        print(
-            tools.pretty_json(fetched_flats)
-        )
+        fetched_flats = {k: tools.sort_list_of_dicts_by(v["new"], "cost") for k, v in fetched_flats.items()}
+
+        print(tools.pretty_json(fetched_flats))
         return
     # Filter command
     elif args.cmd == "filter":
@@ -190,28 +171,19 @@ def main():
         if args.input:
             fetched_flats = fetch.load_flats_from_file(args.input, config)
 
-            fetched_flats = cmds.filter_fetched_flats(
-                config,
-                fetched_flats=fetched_flats,
-                fetch_details=False
-            )
+            fetched_flats = cmds.filter_fetched_flats(config, fetched_flats=fetched_flats, fetch_details=False)
 
             # Sort by cost
-            fetched_flats = {
-                k: tools.sort_list_of_dicts_by(v["new"], "cost")
-                for k, v in fetched_flats.items()
-            }
+            fetched_flats = {k: tools.sort_list_of_dicts_by(v["new"], "cost") for k, v in fetched_flats.items()}
 
             # Output to stdout
-            print(
-                tools.pretty_json(fetched_flats)
-            )
+            print(tools.pretty_json(fetched_flats))
         else:
             cmds.import_and_filter(config, load_from_db=True)
         return
     # Import command
     elif args.cmd == "import":
-        cmds.import_and_filter(config, load_from_db=False)
+        cmds.import_and_filter(config, load_from_db=False, new_only=args.new_only)
         return
     # Serve command
     elif args.cmd == "serve":
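
Note on the parse_args change above: returning the parser alongside the parsed namespace lets main() fall back to printing usage when no subcommand was given (on Python 3, subparsers are optional by default, so args.cmd can legitimately be None). A minimal standalone sketch of the same argparse pattern, independent of the Flatisfy code:

import argparse
import sys


def parse_args(argv=None):
    parser = argparse.ArgumentParser(prog="Flatisfy", description="Find the perfect flat.")
    subparsers = parser.add_subparsers(dest="cmd", help="Available subcommands")
    subparsers.add_parser("fetch", help="Fetch housings posts")
    # Return the parser together with the namespace so callers can reuse it.
    return parser, parser.parse_args(argv)


def main():
    parser, args = parse_args()
    if not args.cmd:
        # No subcommand given: show the help text instead of crashing later.
        parser.print_help()
        sys.exit(0)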

flatisfy/cmds.py (+52, -48)

@@ -18,21 +18,23 @@ from flatisfy import fetch
 from flatisfy import tools
 from flatisfy.filters import metadata
 from flatisfy.web import app as web_app
-
+import time
+from ratelimit.exception import RateLimitException
 
 LOGGER = logging.getLogger(__name__)
 
 
-def filter_flats_list(config, constraint_name, flats_list, fetch_details=True):
+def filter_flats_list(config, constraint_name, flats_list, fetch_details=True, past_flats=None):
     """
     Filter the available flats list. Then, filter it according to criteria.
 
     :param config: A config dict.
     :param constraint_name: The constraint name that the ``flats_list`` should
         satisfy.
+    :param flats_list: The initial list of flat objects to filter.
     :param fetch_details: Whether additional details should be fetched between
         the two passes.
-    :param flats_list: The initial list of flat objects to filter.
+    :param past_flats: The list of already fetched flats
     :return: A dict mapping flat status and list of flat objects.
     """
     # Add the flatisfy metadata entry and prepare the flat objects
@@ -44,13 +46,9 @@ def filter_flats_list(config, constraint_name, flats_list, fetch_details=True):
     except KeyError:
         LOGGER.error(
             "Missing constraint %s. Skipping filtering for these posts.",
-            constraint_name
+            constraint_name,
         )
-        return {
-            "new": [],
-            "duplicate": [],
-            "ignored": []
-        }
+        return {"new": [], "duplicate": [], "ignored": []}
 
     first_pass_result = collections.defaultdict(list)
     second_pass_result = collections.defaultdict(list)
@@ -58,52 +56,55 @@ def filter_flats_list(config, constraint_name, flats_list, fetch_details=True):
     # Do a first pass with the available infos to try to remove as much
     # unwanted postings as possible
     if config["passes"] > 0:
-        first_pass_result = flatisfy.filters.first_pass(flats_list,
-                                                        constraint,
-                                                        config)
+        first_pass_result = flatisfy.filters.first_pass(flats_list, constraint, config)
     else:
         first_pass_result["new"] = flats_list
 
     # Load additional infos
     if fetch_details:
+        past_ids = {x["id"]: x for x in past_flats} if past_flats else {}
         for i, flat in enumerate(first_pass_result["new"]):
-            details = fetch.fetch_details(config, flat["id"])
+            details = None
+
+            use_cache = past_ids.get(flat["id"])
+            if use_cache:
+                LOGGER.debug("Skipping details download for %s.", flat["id"])
+                details = use_cache
+            else:
+                if flat["id"].split("@")[1] in ["seloger", "leboncoin"]:
+                    try:
+                        details = fetch.fetch_details_rate_limited(config, flat["id"])
+                    except RateLimitException:
+                        time.sleep(60)
+                        details = fetch.fetch_details_rate_limited(config, flat["id"])
+                else:
+                    details = fetch.fetch_details(config, flat["id"])
+
             first_pass_result["new"][i] = tools.merge_dicts(flat, details)
 
     # Do a second pass to consolidate all the infos we found and make use of
     # additional infos
     if config["passes"] > 1:
-        second_pass_result = flatisfy.filters.second_pass(
-            first_pass_result["new"], constraint, config
-        )
+        second_pass_result = flatisfy.filters.second_pass(first_pass_result["new"], constraint, config)
     else:
         second_pass_result["new"] = first_pass_result["new"]
 
     # Do a third pass to deduplicate better
     if config["passes"] > 2:
-        third_pass_result = flatisfy.filters.third_pass(
-            second_pass_result["new"],
-            config
-        )
+        third_pass_result = flatisfy.filters.third_pass(second_pass_result["new"], config)
     else:
         third_pass_result["new"] = second_pass_result["new"]
 
     return {
         "new": third_pass_result["new"],
         "duplicate": (
-            first_pass_result["duplicate"] +
-            second_pass_result["duplicate"] +
-            third_pass_result["duplicate"]
+            first_pass_result["duplicate"] + second_pass_result["duplicate"] + third_pass_result["duplicate"]
        ),
-        "ignored": (
-            first_pass_result["ignored"] +
-            second_pass_result["ignored"] +
-            third_pass_result["ignored"]
-        )
+        "ignored": (first_pass_result["ignored"] + second_pass_result["ignored"] + third_pass_result["ignored"]),
     }
 
 
-def filter_fetched_flats(config, fetched_flats, fetch_details=True):
+def filter_fetched_flats(config, fetched_flats, fetch_details=True, past_flats={}):
     """
     Filter the available flats list. Then, filter it according to criteria.
 
@@ -120,12 +121,13 @@ def filter_fetched_flats(config, fetched_flats, fetch_details=True):
             config,
             constraint_name,
             flats_list,
-            fetch_details
+            fetch_details,
+            past_flats.get(constraint_name, None),
         )
     return fetched_flats
 
 
-def import_and_filter(config, load_from_db=False):
+def import_and_filter(config, load_from_db=False, new_only=False):
     """
     Fetch the available flats list. Then, filter it according to criteria.
     Finally, store it in the database.
@@ -136,17 +138,23 @@ def import_and_filter(config, load_from_db=False):
     :return: ``None``.
     """
     # Fetch and filter flats list
+    past_flats = fetch.load_flats_from_db(config)
     if load_from_db:
-        fetched_flats = fetch.load_flats_from_db(config)
+        fetched_flats = past_flats
     else:
         fetched_flats = fetch.fetch_flats(config)
     # Do not fetch additional details if we loaded data from the db.
-    flats_by_status = filter_fetched_flats(config, fetched_flats=fetched_flats,
-                                           fetch_details=(not load_from_db))
+    flats_by_status = filter_fetched_flats(
+        config,
+        fetched_flats=fetched_flats,
+        fetch_details=(not load_from_db),
+        past_flats=past_flats if new_only else {},
+    )
     # Create database connection
     get_session = database.init_db(config["database"], config["search_index"])
 
     new_flats = []
+    result = []
 
     LOGGER.info("Merging fetched flats in database...")
     # Flatten the flats_by_status dict
@@ -159,14 +167,11 @@ def import_and_filter(config, load_from_db=False):
         # Set is_expired to true for all existing flats.
         # This will be set back to false if we find them during importing.
         for flat in session.query(flat_model.Flat).all():
-            flat.is_expired = True;
+            flat.is_expired = True
 
         for status, flats_list in flatten_flats_by_status.items():
             # Build SQLAlchemy Flat model objects for every available flat
-            flats_objects = {
-                flat_dict["id"]: flat_model.Flat.from_dict(flat_dict)
-                for flat_dict in flats_list
-            }
+            flats_objects = {flat_dict["id"]: flat_model.Flat.from_dict(flat_dict) for flat_dict in flats_list}
 
             if flats_objects:
                 # If there are some flats, try to merge them with the ones in
@@ -179,9 +184,7 @@ def import_and_filter(config, load_from_db=False):
                     # status if the user defined it
                     flat_object = flats_objects[each.id]
                     if each.status in flat_model.AUTOMATED_STATUSES:
-                        flat_object.status = getattr(
-                            flat_model.FlatStatus, status
-                        )
+                        flat_object.status = getattr(flat_model.FlatStatus, status)
                     else:
                         flat_object.status = each.status
 
@@ -198,21 +201,22 @@ def import_and_filter(config, load_from_db=False):
                 flat.status = getattr(flat_model.FlatStatus, status)
                 if flat.status == flat_model.FlatStatus.new:
                     new_flats.append(flat)
+                    result.append(flat.id)
 
             session.add_all(flats_objects.values())
 
         if config["send_email"]:
             email.send_notification(config, new_flats)
 
+    LOGGER.info(f"Found {len(result)} new flats.")
+
     # Touch a file to indicate last update timestamp
-    ts_file = os.path.join(
-        config["data_directory"],
-        "timestamp"
-    )
-    with open(ts_file, 'w'):
+    ts_file = os.path.join(config["data_directory"], "timestamp")
+    with open(ts_file, "w"):
         os.utime(ts_file, None)
 
     LOGGER.info("Done!")
+    return result
 
 
 def purge_db(config):
@@ -253,4 +257,4 @@ def serve(config):
         server = web_app.QuietWSGIRefServer
 
     print("Launching web viewer running on http://%s:%s" % (config["host"], config["port"]))
-    app.run(host=config["host"], port=config["port"], server=server)
+    app.run(host=config["host"], port=config["port"], server=server)
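
The details-fetching loop above throttles Seloger/LeBonCoin lookups and retries once after a pause when the ratelimit library raises RateLimitException. A hedged sketch of how a rate-limited fetcher such as fetch_details_rate_limited could be declared with that library; the quota values and the plain fetch_details stub are illustrative assumptions, not the actual flatisfy/fetch.py code:

import time

from ratelimit import limits
from ratelimit.exception import RateLimitException


def fetch_details(config, flat_id):
    # Stand-in for the real Weboob-backed fetcher.
    return {"id": flat_id}


@limits(calls=10, period=60)  # Assumed quota: at most 10 calls per minute.
def fetch_details_rate_limited(config, flat_id):
    # Once the quota is exhausted, the decorator raises RateLimitException
    # instead of calling through.
    return fetch_details(config, flat_id)


def fetch_with_backoff(config, flat_id):
    # Same calling pattern as the import loop above: wait out the window
    # once, then retry.
    try:
        return fetch_details_rate_limited(config, flat_id)
    except RateLimitException:
        time.sleep(60)
        return fetch_details_rate_limited(config, flat_id)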

flatisfy/config.py (+50, -57)

@@ -30,24 +30,25 @@ DEFAULT_CONFIG = {
         "default": {
             "type": None,  # RENT, SALE, SHARING
             "house_types": [],  # List of house types, must be in APART, HOUSE,
-                                # PARKING, LAND, OTHER or UNKNOWN
+            # PARKING, LAND, OTHER or UNKNOWN
             "postal_codes": [],  # List of postal codes
+            "insees": [],  # List of INSEE codes
             "area": (None, None),  # (min, max) in m^2
             "cost": (None, None),  # (min, max) in currency unit
             "rooms": (None, None),  # (min, max)
             "bedrooms": (None, None),  # (min, max)
             "minimum_nb_photos": None,  # min number of photos
             "description_should_contain": [],  # list of terms
-            "description_should_not_contain": [  # list of terms
+            "description_should_not_contain": [
                 "vendu",
                 "Vendu",
                 "VENDU",
-                "recherche"
+                "recherche",
             ],
             "time_to": {}  # Dict mapping names to {"gps": [lat, lng],
-                           #                        "time": (min, max),
-                           #                        "mode": Valid mode }
-                           # Time is in seconds
+            #                        "time": (min, max),
+            #                        "mode": Valid mode }
+            # Time is in seconds
         }
     },
     # Whether or not to store personal data from housing posts (phone number
@@ -91,15 +92,17 @@ DEFAULT_CONFIG = {
     "backends": None,
     # Should email notifications be sent?
     "send_email": False,
-    "smtp_server": 'localhost',
+    "smtp_server": "localhost",
     "smtp_port": 25,
     "smtp_username": None,
     "smtp_password": None,
     "smtp_from": "noreply@flatisfy.org",
     "smtp_to": [],
+    "notification_lang": "en",
     # The web site url, to be used in email notifications. (doesn't matter
     # whether the trailing slash is present or not)
-    "website_url": "http://127.0.0.1:8080"
+    "website_url": "http://127.0.0.1:8080",
+    "ignore_station": False,
 }
 
 LOGGER = logging.getLogger(__name__)
@@ -114,20 +117,14 @@ def validate_config(config, check_with_data):
         check the config values.
     :return: ``True`` if the configuration is valid, ``False`` otherwise.
     """
+
     def _check_constraints_bounds(bounds):
         """
         Check the bounds for numeric constraints.
         """
         assert isinstance(bounds, list)
         assert len(bounds) == 2
-        assert all(
-            x is None or
-            (
-                isinstance(x, (float, int)) and
-                x >= 0
-            )
-            for x in bounds
-        )
+        assert all(x is None or (isinstance(x, (float, int)) and x >= 0) for x in bounds)
         if bounds[0] is not None and bounds[1] is not None:
             assert bounds[1] > bounds[0]
 
@@ -139,7 +136,9 @@ def validate_config(config, check_with_data):
         # pylint: disable=locally-disabled,line-too-long
 
         assert config["passes"] in [0, 1, 2, 3]
-        assert config["max_entries"] is None or (isinstance(config["max_entries"], int) and config["max_entries"] > 0)  # noqa: E501
+        assert config["max_entries"] is None or (
+            isinstance(config["max_entries"], int) and config["max_entries"] > 0
+        )  # noqa: E501
 
         assert config["data_directory"] is None or isinstance(config["data_directory"], str)  # noqa: E501
         assert os.path.isdir(config["data_directory"])
@@ -159,6 +158,7 @@ def validate_config(config, check_with_data):
         assert config["smtp_username"] is None or isinstance(config["smtp_username"], str)  # noqa: E501
         assert config["smtp_password"] is None or isinstance(config["smtp_password"], str)  # noqa: E501
         assert config["smtp_to"] is None or isinstance(config["smtp_to"], list)
+        assert config["notification_lang"] is None or isinstance(config["notification_lang"], str)
 
         assert isinstance(config["store_personal_data"], bool)
         assert isinstance(config["max_distance_housing_station"], (int, float))
@@ -169,6 +169,8 @@ def validate_config(config, check_with_data):
         assert config["navitia_api_key"] is None or isinstance(config["navitia_api_key"], str)  # noqa: E501
         assert config["mapbox_api_key"] is None or isinstance(config["mapbox_api_key"], str)  # noqa: E501
 
+        assert config["ignore_station"] is None or isinstance(config["ignore_station"], bool)  # noqa: E501
+
         # Ensure constraints are ok
         assert config["constraints"]
         for constraint in config["constraints"].values():
@@ -188,8 +190,7 @@ def validate_config(config, check_with_data):
                     assert isinstance(term, str)
 
             assert "description_should_not_contain" in constraint
-            assert isinstance(constraint["description_should_not_contain"],
-                              list)
+            assert isinstance(constraint["description_should_not_contain"], list)
             if constraint["description_should_not_contain"]:
                 for term in constraint["description_should_not_contain"]:
                     assert isinstance(term, str)
@@ -202,16 +203,22 @@ def validate_config(config, check_with_data):
             assert "postal_codes" in constraint
             assert constraint["postal_codes"]
             assert all(isinstance(x, str) for x in constraint["postal_codes"])
+            if "insee_codes" in constraint:
+                assert constraint["insee_codes"]
+                assert all(isinstance(x, str) for x in constraint["insee_codes"])
+
             if check_with_data:
                 # Ensure data is built into db
                 data.preprocess_data(config, force=False)
                 # Check postal codes
-                opendata_postal_codes = [
-                    x.postal_code
-                    for x in data.load_data(PostalCode, constraint, config)
-                ]
+                opendata = data.load_data(PostalCode, constraint, config)
+                opendata_postal_codes = [x.postal_code for x in opendata]
+                opendata_insee_codes = [x.insee_code for x in opendata]
                 for postal_code in constraint["postal_codes"]:
                     assert postal_code in opendata_postal_codes  # noqa: E501
+                if "insee_codes" in constraint:
+                    for insee in constraint["insee_codes"]:
+                        assert insee in opendata_insee_codes  # noqa: E501
 
             assert "area" in constraint
             _check_constraints_bounds(constraint["area"])
@@ -264,22 +271,18 @@ def load_config(args=None, check_with_data=True):
                 config_data.update(json.load(fh))
         except (IOError, ValueError) as exc:
             LOGGER.error(
-                "Unable to load configuration from file, "
-                "using default configuration: %s.",
-                exc
+                "Unable to load configuration from file, using default configuration: %s.",
+                exc,
            )
 
     # Overload config with arguments
     if args and getattr(args, "passes", None) is not None:
-        LOGGER.debug(
-            "Overloading number of passes from CLI arguments: %d.",
-            args.passes
-        )
+        LOGGER.debug("Overloading number of passes from CLI arguments: %d.", args.passes)
         config_data["passes"] = args.passes
     if args and getattr(args, "max_entries", None) is not None:
         LOGGER.debug(
             "Overloading maximum number of entries from CLI arguments: %d.",
-            args.max_entries
+            args.max_entries,
         )
         config_data["max_entries"] = args.max_entries
     if args and getattr(args, "port", None) is not None:
@@ -294,49 +297,39 @@ def load_config(args=None, check_with_data=True):
         LOGGER.debug("Overloading data directory from CLI arguments.")
         config_data["data_directory"] = args.data_dir
     elif config_data["data_directory"] is None:
-        config_data["data_directory"] = appdirs.user_data_dir(
-            "flatisfy",
-            "flatisfy"
-        )
-        LOGGER.debug("Using default XDG data directory: %s.",
-                     config_data["data_directory"])
+        config_data["data_directory"] = appdirs.user_data_dir("flatisfy", "flatisfy")
+        LOGGER.debug("Using default XDG data directory: %s.", config_data["data_directory"])
 
     if not os.path.isdir(config_data["data_directory"]):
-        LOGGER.info("Creating data directory according to config: %s",
-                    config_data["data_directory"])
+        LOGGER.info(
+            "Creating data directory according to config: %s",
+            config_data["data_directory"],
+        )
         os.makedirs(config_data["data_directory"])
         os.makedirs(os.path.join(config_data["data_directory"], "images"))
 
     if config_data["database"] is None:
-        config_data["database"] = "sqlite:///" + os.path.join(
-            config_data["data_directory"],
-            "flatisfy.db"
-        )
+        config_data["database"] = "sqlite:///" + os.path.join(config_data["data_directory"], "flatisfy.db")
 
     if config_data["search_index"] is None:
-        config_data["search_index"] = os.path.join(
-            config_data["data_directory"],
-            "search_index"
-        )
+        config_data["search_index"] = os.path.join(config_data["data_directory"], "search_index")
 
     # Handle constraints filtering
     if args and getattr(args, "constraints", None) is not None:
         LOGGER.info(
-            ("Filtering constraints from config according to CLI argument. "
-             "Using only the following constraints: %s."),
-            args.constraints.replace(",", ", ")
+            (
+                "Filtering constraints from config according to CLI argument. "
+                "Using only the following constraints: %s."
+            ),
+            args.constraints.replace(",", ", "),
         )
         constraints_filter = args.constraints.split(",")
-        config_data["constraints"] = {
-            k: v
-            for k, v in config_data["constraints"].items()
-            if k in constraints_filter
-        }
+        config_data["constraints"] = {k: v for k, v in config_data["constraints"].items() if k in constraints_filter}
 
     # Sanitize website url
     if config_data["website_url"] is not None:
-        if config_data["website_url"][-1] != '/':
-            config_data["website_url"] += '/'
+        if config_data["website_url"][-1] != "/":
+            config_data["website_url"] += "/"
 
     config_validation = validate_config(config_data, check_with_data)
     if config_validation is True:
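
For reference, a constraint exercising the new keys validated above could look like the following sketch; all values are made up. Note the naming mismatch visible in the hunks: DEFAULT_CONFIG ships a default named "insees", while validate_config checks an optional constraint key named "insee_codes".

# Illustrative constraint dict only, in the shape DEFAULT_CONFIG uses.
example_constraint = {
    "type": "RENT",
    "house_types": ["APART"],
    "postal_codes": ["75011", "75012"],
    "insee_codes": ["75111", "75112"],  # optional; checked against opendata INSEE codes
    "area": (30, None),  # at least 30 m^2
    "cost": (None, 1500),  # at most 1500 per month
    "rooms": (2, None),
    "bedrooms": (1, None),
    "minimum_nb_photos": 3,
    "description_should_contain": [],
    "description_should_not_contain": ["vendu"],
    "time_to": {},
}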

flatisfy/constants.py (+1, -1)

@@ -16,7 +16,7 @@ BACKENDS_BY_PRECEDENCE = [
     "pap",
     "leboncoin",
     "explorimmo",
-    "logicimmo"
+    "logicimmo",
 ]
 
 

flatisfy/data.py (+5, -11)

@@ -24,11 +24,13 @@ except ImportError:
     try:
         from functools32 import lru_cache
     except ImportError:
+
         def lru_cache(maxsize=None):  # pylint: disable=unused-argument
             """
             Identity implementation of ``lru_cache`` for fallback.
             """
             return lambda func: func
+
         LOGGER.warning(
             "`functools.lru_cache` is not available on your system. Consider "
             "installing `functools32` Python module if using Python2 for "
@@ -48,10 +50,7 @@ def preprocess_data(config, force=False):
     # Check if a build is required
     get_session = database.init_db(config["database"], config["search_index"])
     with get_session() as session:
-        is_built = (
-            session.query(PublicTransport).count() > 0 and
-            session.query(PostalCode).count() > 0
-        )
+        is_built = session.query(PublicTransport).count() > 0 and session.query(PostalCode).count() > 0
         if is_built and not force:
             # No need to rebuild the database, skip
             return False
@@ -64,9 +63,7 @@ def preprocess_data(config, force=False):
     for preprocess in data_files.PREPROCESSING_FUNCTIONS:
         data_objects = preprocess()
         if not data_objects:
-            raise flatisfy.exceptions.DataBuildError(
-                "Error with %s." % preprocess.__name__
-            )
+            raise flatisfy.exceptions.DataBuildError("Error with %s." % preprocess.__name__)
         with get_session() as session:
             session.add_all(data_objects)
     LOGGER.info("Done building data!")
@@ -96,10 +93,7 @@ def load_data(model, constraint, config):
         # Load data for each area
         areas = list(set(areas))
         for area in areas:
-            results.extend(
-                session.query(model)
-                .filter(model.area == area).all()
-            )
+            results.extend(session.query(model).filter(model.area == area).all())
         # Expunge loaded data from the session to be able to use them
         # afterwards
         session.expunge_all()

+ 74
- 57
flatisfy/data_files/__init__.py View File

@@ -24,8 +24,8 @@ MODULE_DIR = os.path.dirname(os.path.realpath(__file__))
24 24
 
25 25
 titlecase.set_small_word_list(
26 26
     # Add French small words
27
-    r"l|d|un|une|et|à|a|sur|ou|le|la|de|lès|les|" +
28
-    titlecase.SMALL
27
+    r"l|d|un|une|et|à|a|sur|ou|le|la|de|lès|les|"
28
+    + titlecase.SMALL
29 29
 )
30 30
 
31 31
 TRANSPORT_DATA_FILES = {
@@ -33,7 +33,7 @@ TRANSPORT_DATA_FILES = {
33 33
     "FR-NW": "stops_fr-nw.txt",
34 34
     "FR-NE": "stops_fr-ne.txt",
35 35
     "FR-SW": "stops_fr-sw.txt",
36
-    "FR-SE": "stops_fr-se.txt"
36
+    "FR-SE": "stops_fr-se.txt",
37 37
 }
38 38
 
39 39
 
@@ -51,8 +51,20 @@ def french_postal_codes_to_quarter(postal_code):
51 51
     # French departements
52 52
     # Taken from Wikipedia data.
53 53
     department_to_subdivision = {
54
-        "FR-ARA": ["01", "03", "07", "15", "26", "38", "42", "43", "63", "69",
55
-                   "73", "74"],
54
+        "FR-ARA": [
55
+            "01",
56
+            "03",
57
+            "07",
58
+            "15",
59
+            "26",
60
+            "38",
61
+            "42",
62
+            "43",
63
+            "63",
64
+            "69",
65
+            "73",
66
+            "74",
67
+        ],
56 68
         "FR-BFC": ["21", "25", "39", "58", "70", "71", "89", "90"],
57 69
         "FR-BRE": ["22", "29", "35", "44", "56"],
58 70
         "FR-CVL": ["18", "28", "36", "37", "41", "45"],
@@ -61,36 +73,53 @@ def french_postal_codes_to_quarter(postal_code):
61 73
         "FR-HDF": ["02", "59", "60", "62", "80"],
62 74
         "FR-IDF": ["75", "77", "78", "91", "92", "93", "94", "95"],
63 75
         "FR-NOR": ["14", "27", "50", "61", "76"],
64
-        "FR-NAQ": ["16", "17", "19", "23", "24", "33", "40", "47", "64", "79",
65
-                   "86", "87"],
66
-        "FR-OCC": ["09", "11", "12", "30", "31", "32", "34", "46", "48", "65",
67
-                   "66", "81", "82"],
76
+        "FR-NAQ": [
77
+            "16",
78
+            "17",
79
+            "19",
80
+            "23",
81
+            "24",
82
+            "33",
83
+            "40",
84
+            "47",
85
+            "64",
86
+            "79",
87
+            "86",
88
+            "87",
89
+        ],
90
+        "FR-OCC": [
91
+            "09",
92
+            "11",
93
+            "12",
94
+            "30",
95
+            "31",
96
+            "32",
97
+            "34",
98
+            "46",
99
+            "48",
100
+            "65",
101
+            "66",
102
+            "81",
103
+            "82",
104
+        ],
68 105
         "FR-PDL": ["44", "49", "53", "72", "85"],
69
-        "FR-PAC": ["04", "05", "06", "13", "83", "84"]
106
+        "FR-PAC": ["04", "05", "06", "13", "83", "84"],
70 107
     }
71 108
     subdivision_to_quarters = {
72
-        'FR-IDF': ['FR-IDF'],
73
-        'FR-NW': ['FR-BRE', 'FR-CVL', 'FR-NOR', 'FR-PDL'],
74
-        'FR-NE': ['FR-BFC', 'FR-GES', 'FR-HDF'],
75
-        'FR-SE': ['FR-ARA', 'FR-COR', 'FR-PAC', 'FR-OCC'],
76
-        'FR-SW': ['FR-NAQ']
109
+        "FR-IDF": ["FR-IDF"],
110
+        "FR-NW": ["FR-BRE", "FR-CVL", "FR-NOR", "FR-PDL"],
111
+        "FR-NE": ["FR-BFC", "FR-GES", "FR-HDF"],
112
+        "FR-SE": ["FR-ARA", "FR-COR", "FR-PAC", "FR-OCC"],
113
+        "FR-SW": ["FR-NAQ"],
77 114
     }
78 115
 
79 116
     subdivision = next(
80
-        (
81
-            i
82
-            for i, departments in department_to_subdivision.items()
83
-            if departement in departments
84
-        ),
85
-        None
117
+        (i for i, departments in department_to_subdivision.items() if departement in departments),
118
+        None,
86 119
     )
87 120
     return next(
88
-        (
89
-            i
90
-            for i, subdivisions in subdivision_to_quarters.items()
91
-            if subdivision in subdivisions
92
-        ),
93
-        None
121
+        (i for i, subdivisions in subdivision_to_quarters.items() if subdivision in subdivisions),
122
+        None,
94 123
     )
95 124
 
96 125
 
@@ -106,9 +135,7 @@ def _preprocess_laposte():
106 135
     raw_laposte_data = []
107 136
     # Load opendata file
108 137
     try:
109
-        with io.open(
110
-            os.path.join(MODULE_DIR, data_file), "r", encoding='utf-8'
111
-        ) as fh:
138
+        with io.open(os.path.join(MODULE_DIR, data_file), "r", encoding="utf-8") as fh:
             raw_laposte_data = json.load(fh)
     except (IOError, ValueError):
         LOGGER.error("Invalid raw LaPoste opendata file.")
@@ -124,31 +151,30 @@ def _preprocess_laposte():
         try:
             area = french_postal_codes_to_quarter(fields["code_postal"])
             if area is None:
-                LOGGER.info(
+                LOGGER.debug(
                     "No matching area found for postal code %s, skipping it.",
-                    fields["code_postal"]
+                    fields["code_postal"],
                 )
                 continue
 
-            name = normalize_string(
-                titlecase.titlecase(fields["nom_de_la_commune"]),
-                lowercase=False
-            )
+            name = normalize_string(titlecase.titlecase(fields["nom_de_la_commune"]), lowercase=False)
 
             if (fields["code_postal"], name) in seen_postal_codes:
                 continue
 
             seen_postal_codes.append((fields["code_postal"], name))
-            postal_codes_data.append(PostalCode(
-                area=area,
-                postal_code=fields["code_postal"],
-                name=name,
-                lat=fields["coordonnees_gps"][0],
-                lng=fields["coordonnees_gps"][1]
-            ))
+            postal_codes_data.append(
+                PostalCode(
+                    area=area,
+                    postal_code=fields["code_postal"],
+                    insee_code=fields["code_commune_insee"],
+                    name=name,
+                    lat=fields["coordonnees_gps"][0],
+                    lng=fields["coordonnees_gps"][1],
+                )
+            )
         except KeyError:
-            LOGGER.info("Missing data for postal code %s, skipping it.",
-                        fields["code_postal"])
+            LOGGER.debug("Missing data for postal code %s, skipping it.", fields["code_postal"])
 
     return postal_codes_data
 
@@ -164,17 +190,11 @@ def _preprocess_public_transport():
     for area, data_file in TRANSPORT_DATA_FILES.items():
         LOGGER.info("Building from public transport data %s.", data_file)
         try:
-            with io.open(os.path.join(MODULE_DIR, data_file), "r",
-                         encoding='utf-8') as fh:
+            with io.open(os.path.join(MODULE_DIR, data_file), "r", encoding="utf-8") as fh:
                 filereader = csv.reader(fh)
                 next(filereader, None)  # Skip first row (headers)
                 for row in filereader:
-                    public_transport_data.append(PublicTransport(
-                        name=row[2],
-                        area=area,
-                        lat=row[3],
-                        lng=row[4]
-                    ))
+                    public_transport_data.append(PublicTransport(name=row[2], area=area, lat=row[3], lng=row[4]))
         except (IOError, IndexError):
             LOGGER.error("Invalid raw opendata file: %s.", data_file)
             return []
@@ -183,7 +203,4 @@ def _preprocess_public_transport():
 
 
 # List of all the available preprocessing functions. Order can be important.
-PREPROCESSING_FUNCTIONS = [
-    _preprocess_laposte,
-    _preprocess_public_transport
-]
+PREPROCESSING_FUNCTIONS = [_preprocess_laposte, _preprocess_public_transport]
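For reference, a rough sketch of the shape of one LaPoste opendata record consumed by `_preprocess_laposte` above; the values are invented, but the keys match the fields read in this hunk, including `code_commune_insee`, which feeds the new `insee_code` column:

import titlecase

record = {
    "code_postal": "75014",
    "code_commune_insee": "75114",  # new field persisted as insee_code
    "nom_de_la_commune": "PARIS 14",
    "coordonnees_gps": [48.829, 2.327],  # (lat, lng)
}

# The commune name is title-cased before being normalized and stored.
name = titlecase.titlecase(record["nom_de_la_commune"])  # "Paris 14"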

+ 863916
- 1
flatisfy/data_files/laposte.json
File diff suppressed because it is too large
View File


+ 1
- 3
flatisfy/database/__init__.py View File

@@ -47,9 +47,7 @@ def init_db(database_uri=None, search_db_uri=None):
     Session = sessionmaker(bind=engine)  # pylint: disable=locally-disabled,invalid-name
 
     if search_db_uri:
-        index_service = IndexService(
-            whoosh_base=search_db_uri
-        )
+        index_service = IndexService(whoosh_base=search_db_uri)
         index_service.register_class(flatisfy.models.flat.Flat)
 
     @contextmanager

+ 1
- 1
flatisfy/database/types.py View File

@@ -50,4 +50,4 @@ class StringyJSON(types.TypeDecorator):
 # TypeEngine.with_variant says "use StringyJSON instead when
 # connecting to 'sqlite'"
 # pylint: disable=locally-disabled,invalid-name
-MagicJSON = types.JSON().with_variant(StringyJSON, 'sqlite')
+MagicJSON = types.JSON().with_variant(StringyJSON, "sqlite")
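A minimal sketch of declaring a column with `MagicJSON`; the `Item` model here is hypothetical:

from sqlalchemy import Column, Integer
from sqlalchemy.ext.declarative import declarative_base

Base = declarative_base()


class Item(Base):  # hypothetical model
    __tablename__ = "items"
    id = Column(Integer, primary_key=True)
    # Stored as TEXT-backed JSON on SQLite, as the stock JSON type elsewhere.
    payload = Column(MagicJSON)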

+ 11
- 22
flatisfy/database/whooshalchemy.py View File

@@ -30,7 +30,6 @@ from whoosh.qparser import MultifieldParser
 
 
 class IndexService(object):
-
     def __init__(self, config=None, whoosh_base=None):
         if not whoosh_base and config:
             whoosh_base = config.get("WHOOSH_BASE")
@@ -84,8 +83,7 @@ class IndexService(object):
                 primary = field.name
                 continue
             if field.name in model_class.__searchable__:
-                schema[field.name] = whoosh.fields.TEXT(
-                    analyzer=StemmingAnalyzer())
+                schema[field.name] = whoosh.fields.TEXT(analyzer=StemmingAnalyzer())
         return Schema(**schema), primary
 
     def before_commit(self, session):
@@ -93,21 +91,18 @@ class IndexService(object):
 
         for model in session.new:
             model_class = model.__class__
-            if hasattr(model_class, '__searchable__'):
-                self.to_update.setdefault(model_class.__name__, []).append(
-                    ("new", model))
+            if hasattr(model_class, "__searchable__"):
+                self.to_update.setdefault(model_class.__name__, []).append(("new", model))
 
         for model in session.deleted:
             model_class = model.__class__
-            if hasattr(model_class, '__searchable__'):
-                self.to_update.setdefault(model_class.__name__, []).append(
-                    ("deleted", model))
+            if hasattr(model_class, "__searchable__"):
+                self.to_update.setdefault(model_class.__name__, []).append(("deleted", model))
 
         for model in session.dirty:
             model_class = model.__class__
-            if hasattr(model_class, '__searchable__'):
-                self.to_update.setdefault(model_class.__name__, []).append(
-                    ("changed", model))
+            if hasattr(model_class, "__searchable__"):
+                self.to_update.setdefault(model_class.__name__, []).append(("changed", model))
 
     def after_commit(self, session):
         """
@@ -128,16 +123,11 @@ class IndexService(object):
                     # added as a new doc. Could probably replace this with a whoosh
                     # update.
 
-                    writer.delete_by_term(
-                        primary_field, text_type(getattr(model, primary_field)))
+                    writer.delete_by_term(primary_field, text_type(getattr(model, primary_field)))
 
                     if change_type in ("new", "changed"):
-                        attrs = dict((key, getattr(model, key))
-                                     for key in searchable)
-                        attrs = {
-                            attr: text_type(getattr(model, attr))
-                            for attr in attrs.keys()
-                        }
+                        attrs = dict((key, getattr(model, key)) for key in searchable)
+                        attrs = {attr: text_type(getattr(model, attr)) for attr in attrs.keys()}
                         attrs[primary_field] = text_type(getattr(model, primary_field))
                         writer.add_document(**attrs)
 
@@ -158,8 +148,7 @@ class Searcher(object):
         self.parser = MultifieldParser(list(fields), index.schema)
 
     def __call__(self, session, query, limit=None):
-        results = self.index.searcher().search(
-            self.parser.parse(query), limit=limit)
+        results = self.index.searcher().search(self.parser.parse(query), limit=limit)
 
         keys = [x[self.primary] for x in results]
         primary_column = getattr(self.model_class, self.primary)
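For context, a minimal sketch of how this service is consumed, mirroring the `init_db` hunk above; the `Toy` model, engine URL and index path are hypothetical, and the commit-hook wiring is assumed to happen inside `IndexService`:

from sqlalchemy import Column, Integer, Text, create_engine
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker

Base = declarative_base()


class Toy(Base):  # hypothetical searchable model
    __tablename__ = "toys"
    __searchable__ = ["description"]  # columns mirrored into the Whoosh index
    id = Column(Integer, primary_key=True)
    description = Column(Text)


engine = create_engine("sqlite://")
Base.metadata.create_all(engine)
Session = sessionmaker(bind=engine)

index_service = IndexService(whoosh_base="/tmp/whoosh")  # hypothetical path
index_service.register_class(Toy)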

+ 65
- 39
flatisfy/email.py View File

@@ -8,7 +8,7 @@ from builtins import str
 
 import logging
 import smtplib
-
+from money import Money
 from email.mime.multipart import MIMEMultipart
 from email.mime.text import MIMEText
 from email.utils import formatdate, make_msgid
@@ -36,15 +36,15 @@ def send_email(server, port, subject, _from, _to, txt, html, username=None, password=None):
     if username or password:
         server.login(username or "", password or "")
 
-    msg = MIMEMultipart('alternative')
-    msg['Subject'] = subject
-    msg['From'] = _from
-    msg['To'] = ', '.join(_to)
-    msg['Date'] = formatdate()
-    msg['Message-ID'] = make_msgid()
+    msg = MIMEMultipart("alternative")
+    msg["Subject"] = subject
+    msg["From"] = _from
+    msg["To"] = ", ".join(_to)
+    msg["Date"] = formatdate()
+    msg["Message-ID"] = make_msgid()
 
-    msg.attach(MIMEText(txt, 'plain', 'utf-8'))
-    msg.attach(MIMEText(html, 'html', 'utf-8'))
+    msg.attach(MIMEText(txt, "plain", "utf-8"))
+    msg.attach(MIMEText(html, "html", "utf-8"))
 
     server.sendmail(_from, _to, msg.as_string())
     server.quit()
@@ -61,13 +61,33 @@ def send_notification(config, flats):
     if not flats:
         return
 
-    txt = u'Hello dear user,\n\nThe following new flats have been found:\n\n'
-    html = """
+    i18n = {
+        "en": {
+            "subject": f"{len(flats)} new flats found!",
+            "hello": "Hello dear user",
+            "following_new_flats": "The following new flats have been found:",
+            "area": "area",
+            "cost": "cost",
+            "signature": "Hope you'll find what you were looking for.",
+        },
+        "fr": {
+            "subject": f"{len(flats)} nouvelles annonces disponibles !",
+            "hello": "Bonjour cher utilisateur",
+            "following_new_flats": "Voici les nouvelles annonces :",
+            "area": "surface",
+            "cost": "coût",
+            "signature": "Bonne recherche",
+        },
+    }
+    trs = i18n.get(config["notification_lang"], i18n["en"])
+
+    txt = trs["hello"] + ",\n\n" + trs["following_new_flats"] + "\n\n"
+    html = f"""
     <html>
       <head></head>
       <body>
-        <p>Hello dear user!</p>
-        <p>The following new flats have been found:
+        <p>{trs["hello"]}!</p>
+        <p>{trs["following_new_flats"]}
 
             <ul>
     """
@@ -77,41 +97,47 @@ def send_notification(config, flats):
     for flat in flats:
         title = str(flat.title)
         flat_id = str(flat.id)
-        area = str(flat.area)
-        cost = str(flat.cost)
+        area = str(int(flat.area))
+        cost = int(flat.cost)
         currency = str(flat.currency)
 
-        txt += (
-            '- {}: {}#/flat/{} (area: {}, cost: {} {})\n'.format(
-                title, website_url, flat_id, area, cost, currency
-            )
-        )
-
-        html += """
+        txt += f"- {title}: {website_url}#/flat/{flat_id} "
+        html += f"""
             <li>
-                <a href="{}#/flat/{}">{}</a>
-                (area: {}, cost: {} {})
-            </li>
-        """.format(website_url, flat_id, title, area, cost, currency)
+                <a href="{website_url}#/flat/{flat_id}">{title}</a>
+        """
+
+        fields = []
+        if area:
+            fields.append(f"{trs['area']}: {area}m²")
+        if cost:
+            money = Money(cost, currency).format(config["notification_lang"])
+            fields.append(f"{trs['cost']}: {money}")
+
+        if fields:
+            txt += f'({", ".join(fields)})'
+            html += f'({", ".join(fields)})'
+
+        txt += "\n"
+        html += "</li>"
 
     html += "</ul>"
 
-    signature = (
-        u"\nHope you'll find what you were looking for.\n\nBye!\nFlatisfy"
-    )
+    signature = f"\n{trs['signature']}\n\nBye!\nFlatisfy"
     txt += signature
-    html += signature.replace('\n', '<br>')
+    html += signature.replace("\n", "<br>")
 
     html += """</p>
      </body>
    </html>"""
 
-    send_email(config["smtp_server"],
-               config["smtp_port"],
-               "New flats found!",
-               config["smtp_from"],
-               config["smtp_to"],
-               txt,
-               html,
-               config.get("smtp_username"),
-               config.get("smtp_password"))
+    send_email(
+        config["smtp_server"],
+        config["smtp_port"],
+        trs["subject"],
+        config["smtp_from"],
+        config["smtp_to"],
+        txt,
+        html,
+        config.get("smtp_username"),
+        config.get("smtp_password"),
+    )
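The price line now goes through the `money` package instead of raw string formatting. A quick sketch of the call used above, assuming `money` is installed with Babel for locale data (exact spacing of the output depends on the locale):

from money import Money

price = Money(1250, "EUR")
print(price.format("fr"))  # e.g. '1 250,00 €'
print(price.format("en"))  # e.g. '€1,250.00'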

+ 1
- 0
flatisfy/exceptions.py View File

@@ -10,4 +10,5 @@ class DataBuildError(Exception):
     """
     Error occurring on building a data file.
     """
+
     pass

+ 45
- 69
flatisfy/fetch.py View File

@@ -9,6 +9,7 @@ import collections
 import itertools
 import json
 import logging
+from ratelimit import limits
 
 from flatisfy import database
 from flatisfy import tools
@@ -24,8 +25,7 @@ try:
     from weboob.core.ouiboube import WebNip
     from weboob.tools.json import WeboobEncoder
 except ImportError:
-    LOGGER.error("Weboob is not available on your system. Make sure you "
-                 "installed it.")
+    LOGGER.error("Weboob is not available on your system. Make sure you installed it.")
     raise
 
 
@@ -34,6 +34,7 @@ class WebOOBProxy(object):
     Wrapper around WebOOB ``WebNip`` class, to fetch housing posts without
     having to spawn a subprocess.
     """
+
     @staticmethod
     def version():
         """
@@ -77,14 +78,14 @@ class WebOOBProxy(object):
         self.webnip = WebNip(modules_path=config["modules_path"])
 
         # Create backends
-        self.backends = [
-            self.webnip.load_backend(
-                module,
-                module,
-                params={}
-            )
-            for module in backends
-        ]
+        self.backends = []
+        for module in backends:
+            try:
+                self.backends.append(
+                    self.webnip.load_backend(module, module, params={})
+                )
+            except Exception as exc:
+                raise Exception("Unable to load module " + module) from exc
 
     def __enter__(self):
         return self
@@ -114,28 +115,21 @@ class WebOOBProxy(object):
             except CallErrors as exc:
                 # If an error occured, just log it
                 LOGGER.error(
-                    (
-                        "An error occured while building query for "
-                        "postal code %s: %s"
-                    ),
+                    ("An error occured while building query for postal code %s: %s"),
                     postal_code,
-                    str(exc)
+                    str(exc),
                 )
 
                 if not matching_cities:
                     # If postal code gave no match, warn the user
-                    LOGGER.warn(
-                        "Postal code %s could not be matched with a city.",
-                        postal_code
-                    )
+                    LOGGER.warn("Postal code %s could not be matched with a city.", postal_code)
 
         # Remove "TOUTES COMMUNES" entry which are duplicates of the individual
         # cities entries in Logicimmo module.
         matching_cities = [
             city
             for city in matching_cities
-            if not (city.backend == 'logicimmo' and
-                    city.name.startswith('TOUTES COMMUNES'))
+            if not (city.backend == "logicimmo" and city.name.startswith("TOUTES COMMUNES"))
         ]
 
         # Then, build queries by grouping cities by at most 3
@@ -145,21 +139,14 @@ class WebOOBProxy(object):
 
             try:
                 query.house_types = [
-                    getattr(
-                        HOUSE_TYPES,
-                        house_type.upper()
-                    )
-                    for house_type in constraints_dict["house_types"]
+                    getattr(HOUSE_TYPES, house_type.upper()) for house_type in constraints_dict["house_types"]
                 ]
             except AttributeError:
                 LOGGER.error("Invalid house types constraint.")
                 return None
 
             try:
-                query.type = getattr(
-                    POSTS_TYPES,
-                    constraints_dict["type"].upper()
-                )
+                query.type = getattr(POSTS_TYPES, constraints_dict["type"].upper())
             except AttributeError:
                 LOGGER.error("Invalid post type constraint.")
                 return None
@@ -190,26 +177,22 @@ class WebOOBProxy(object):
         # TODO: Handle max_entries better
         try:
             for housing in itertools.islice(
-                    self.webnip.do(
-                        'search_housings',
-                        query,
-                        # Only run the call on the required backends.
-                        # Otherwise, WebOOB is doing weird stuff and returning
-                        # nonsense.
-                        backends=[x for x in self.backends
-                                  if x.name in useful_backends]
-                    ),
-                    max_entries
+                self.webnip.do(
+                    "search_housings",
+                    query,
+                    # Only run the call on the required backends.
+                    # Otherwise, WebOOB is doing weird stuff and returning
+                    # nonsense.
+                    backends=[x for x in self.backends if x.name in useful_backends],
+                ),
+                max_entries,
             ):
                 if not store_personal_data:
                     housing.phone = None
                 housings.append(json.dumps(housing, cls=WeboobEncoder))
         except CallErrors as exc:
             # If an error occured, just log it
-            LOGGER.error(
-                "An error occured while fetching the housing posts: %s",
-                str(exc)
-            )
+            LOGGER.error("An error occured while fetching the housing posts: %s", str(exc))
         return housings
 
     def info(self, full_flat_id, store_personal_data=False):
@@ -224,34 +207,26 @@ class WebOOBProxy(object):
         """
         flat_id, backend_name = full_flat_id.rsplit("@", 1)
         try:
-            backend = next(
-                backend
-                for backend in self.backends
-                if backend.name == backend_name
-            )
+            backend = next(backend for backend in self.backends if backend.name == backend_name)
         except StopIteration:
             LOGGER.error("Backend %s is not available.", backend_name)
             return "{}"
 
         try:
             housing = backend.get_housing(flat_id)
-            # Otherwise, we miss the @backend afterwards
-            housing.id = full_flat_id
             if not store_personal_data:
                 # Ensure phone is cleared
                 housing.phone = None
             else:
                 # Ensure phone is fetched
-                backend.fillobj(housing, 'phone')
+                backend.fillobj(housing, "phone")
+            # Otherwise, we miss the @backend afterwards
+            housing.id = full_flat_id
 
             return json.dumps(housing, cls=WeboobEncoder)
         except Exception as exc:  # pylint: disable=broad-except
             # If an error occured, just log it
-            LOGGER.error(
-                "An error occured while fetching housing %s: %s",
-                full_flat_id,
-                str(exc)
-            )
+            LOGGER.error("An error occured while fetching housing %s: %s", full_flat_id, str(exc))
             return "{}"
 
 
@@ -271,19 +246,24 @@ def fetch_flats(config):
             queries = webOOB_proxy.build_queries(constraint)
             housing_posts = []
             for query in queries:
-                housing_posts.extend(
-                    webOOB_proxy.query(query, config["max_entries"],
-                                       config["store_personal_data"])
-                )
+                housing_posts.extend(webOOB_proxy.query(query, config["max_entries"], config["store_personal_data"]))
+        housing_posts = housing_posts[: config["max_entries"]]
         LOGGER.info("Fetched %d flats.", len(housing_posts))
 
         constraint_flats_list = [json.loads(flat) for flat in housing_posts]
-        constraint_flats_list = [WebOOBProxy.restore_decimal_fields(flat)
-                                 for flat in constraint_flats_list]
+        constraint_flats_list = [WebOOBProxy.restore_decimal_fields(flat) for flat in constraint_flats_list]
        fetched_flats[constraint_name] = constraint_flats_list
     return fetched_flats
 
 
+@limits(calls=10, period=60)
+def fetch_details_rate_limited(config, flat_id):
+    """
+    Limit flats fetching to at most 10 calls per minute to avoid rate banning
+    """
+    return fetch_details(config, flat_id)
+
+
 def fetch_details(config, flat_id):
     """
     Fetch the additional details for a flat using Flatboob / WebOOB.
@@ -294,8 +274,7 @@ def fetch_details(config, flat_id):
     """
     with WebOOBProxy(config) as webOOB_proxy:
         LOGGER.info("Loading additional details for flat %s.", flat_id)
-        webOOB_output = webOOB_proxy.info(flat_id,
-                                          config["store_personal_data"])
+        webOOB_output = webOOB_proxy.info(flat_id, config["store_personal_data"])
 
     flat_details = json.loads(webOOB_output)
     flat_details = WebOOBProxy.restore_decimal_fields(flat_details)
@@ -326,10 +305,7 @@ def load_flats_from_file(json_file, config):
         LOGGER.info("Found %d flats.", len(flats_list))
     except (IOError, ValueError):
         LOGGER.error("File %s is not a valid dump file.", json_file)
-    return {
-        constraint_name: flats_list
-        for constraint_name in config["constraints"]
-    }
+    return {constraint_name: flats_list for constraint_name in config["constraints"]}
 
 
 def load_flats_from_db(config):
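The new `fetch_details_rate_limited` wrapper relies on the `ratelimit` package imported at the top of this file: `@limits` raises `ratelimit.RateLimitException` once the quota is exceeded, so callers are expected to catch it and retry later. A minimal sketch of that behaviour (the `ping` function is a made-up stand-in):

import time

from ratelimit import RateLimitException, limits


@limits(calls=2, period=5)
def ping(i):  # hypothetical stand-in for fetch_details
    return i


for i in range(4):
    try:
        print(ping(i))
    except RateLimitException:
        # Quota of 2 calls per 5 seconds exhausted: wait for the window to
        # reset, then retry once.
        time.sleep(5)
        print(ping(i))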

+ 71
- 99
flatisfy/filters/__init__.py View File

@@ -36,48 +36,51 @@ def refine_with_housing_criteria(flats_list, constraint):
     for i, flat in enumerate(flats_list):
         # Check postal code
         postal_code = flat["flatisfy"].get("postal_code", None)
-        if (
-                postal_code and
-                postal_code not in constraint["postal_codes"]
-        ):
-            LOGGER.info("Postal code for flat %s is out of range.", flat["id"])
-            is_ok[i] = is_ok[i] and False
+        if postal_code and postal_code not in constraint["postal_codes"]:
+            LOGGER.info(
+                "Postal code %s for flat %s is out of range (%s).",
+                postal_code,
+                flat["id"],
+                ", ".join(constraint["postal_codes"]),
+            )
+            is_ok[i] = False
+        # Check insee code
+        insee_code = flat["flatisfy"].get("insee_code", None)
+        if insee_code and "insee_codes" in constraint and insee_code not in constraint["insee_codes"]:
+            LOGGER.info(
+                "insee code %s for flat %s is out of range (%s).",
+                insee_code,
+                flat["id"],
+                ", ".join(constraint["insee_codes"]),
+            )
+            is_ok[i] = False
 
         # Check time_to
         for place_name, time in flat["flatisfy"].get("time_to", {}).items():
             time = time["time"]
-            is_within_interval = tools.is_within_interval(
-                time,
-                *(constraint["time_to"][place_name]["time"])
-            )
+            is_within_interval = tools.is_within_interval(time, *(constraint["time_to"][place_name]["time"]))
             if not is_within_interval:
-                LOGGER.info("Flat %s is too far from place %s: %ds.",
-                            flat["id"], place_name, time)
+                LOGGER.info(
+                    "Flat %s is too far from place %s: %ds.",
+                    flat["id"],
+                    place_name,
+                    time,
+                )
             is_ok[i] = is_ok[i] and is_within_interval
 
         # Check other fields
         for field in ["area", "cost", "rooms", "bedrooms"]:
             interval = constraint[field]
-            is_within_interval = tools.is_within_interval(
-                flat.get(field, None),
-                *interval
-            )
+            is_within_interval = tools.is_within_interval(flat.get(field, None), *interval)
             if not is_within_interval:
-                LOGGER.info("%s for flat %s is out of range.",
-                            field.capitalize(), flat["id"])
+                LOGGER.info(
+                    "%s %s for flat %s is out of range.", field.capitalize(), str(flat.get(field, None)), flat["id"]
+                )
             is_ok[i] = is_ok[i] and is_within_interval
 
     return (
-        [
-            flat
-            for i, flat in enumerate(flats_list)
-            if is_ok[i]
-        ],
-        [
-            flat
-            for i, flat in enumerate(flats_list)
-            if not is_ok[i]
-        ]
+        [flat for i, flat in enumerate(flats_list) if is_ok[i]],
+        [flat for i, flat in enumerate(flats_list) if not is_ok[i]],
     )
 
 
@@ -103,54 +106,36 @@ def refine_with_details_criteria(flats_list, constraint):
 
     for i, flat in enumerate(flats_list):
         # Check number of pictures
-        has_enough_photos = tools.is_within_interval(
-            len(flat.get('photos', [])),
-            constraint['minimum_nb_photos'],
-            None
-        )
+        has_enough_photos = tools.is_within_interval(len(flat.get("photos", [])), constraint["minimum_nb_photos"], None)
         if not has_enough_photos:
             LOGGER.info(
                 "Flat %s only has %d photos, it should have at least %d.",
                 flat["id"],
-                len(flat['photos']),
-                constraint['minimum_nb_photos']
+                len(flat["photos"]),
+                constraint["minimum_nb_photos"],
             )
             is_ok[i] = False
 
-        has_all_good_terms_in_description = True
-        if constraint["description_should_contain"]:
-            has_all_good_terms_in_description = all(
-                term in flat['text']
-                for term in constraint["description_should_contain"]
-            )
-
-        has_a_bad_term_in_description = False
-        if constraint["description_should_not_contain"]:
-            has_a_bad_term_in_description = any(
-                term in flat['text']
-                for term in constraint["description_should_not_contain"]
-            )
-
-        if (not has_all_good_terms_in_description
-            or has_a_bad_term_in_description):
-            LOGGER.info(
-                ("Description for flat %s does not contain all the required "
-                 "terms, or contains a blacklisted term."),
-                flat["id"]
-            )
-            is_ok[i] = False
+        for term in constraint["description_should_contain"]:
+            if term.lower() not in flat["text"].lower():
+                LOGGER.info(
+                    ("Description for flat %s does not contain required term '%s'."),
+                    flat["id"],
+                    term,
+                )
+                is_ok[i] = False
+        for term in constraint["description_should_not_contain"]:
+            if term.lower() in flat["text"].lower():
+                LOGGER.info(
+                    ("Description for flat %s contains blacklisted term '%s'."),
+                    flat["id"],
+                    term,
+                )
+                is_ok[i] = False
 
     return (
-        [
-            flat
-            for i, flat in enumerate(flats_list)
-            if is_ok[i]
-        ],
-        [
-            flat
-            for i, flat in enumerate(flats_list)
-            if not is_ok[i]
-        ]
+        [flat for i, flat in enumerate(flats_list) if is_ok[i]],
+        [flat for i, flat in enumerate(flats_list) if not is_ok[i]],
    )
 
 
@@ -172,30 +157,25 @@ def first_pass(flats_list, constraint, config):
 
     # Handle duplicates based on ids
     # Just remove them (no merge) as they should be the exact same object.
-    flats_list, _ = duplicates.detect(
-        flats_list, key="id", merge=False, should_intersect=False
-    )
+    flats_list, _ = duplicates.detect(flats_list, key="id", merge=False, should_intersect=False)
     # Also merge duplicates based on urls (these may come from different
     # flatboob backends)
     # This is especially useful as some websites such as entreparticuliers
     # contains a lot of leboncoin housings posts.
-    flats_list, duplicates_by_urls = duplicates.detect(
-        flats_list, key="urls", merge=True, should_intersect=True
-    )
+    flats_list, duplicates_by_urls = duplicates.detect(flats_list, key="urls", merge=True, should_intersect=True)
 
     # Guess the postal codes
     flats_list = metadata.guess_postal_code(flats_list, constraint, config)
-    # Try to match with stations
-    flats_list = metadata.guess_stations(flats_list, constraint, config)
+
+    if not config["ignore_station"]:
+        # Try to match with stations
+        flats_list = metadata.guess_stations(flats_list, constraint, config)
+
     # Remove returned housing posts that do not match criteria
-    flats_list, ignored_list = refine_with_housing_criteria(flats_list,
-                                                            constraint)
+    flats_list, ignored_list = refine_with_housing_criteria(flats_list, constraint)
+
+    return {"new": flats_list, "ignored": ignored_list, "duplicate": duplicates_by_urls}
 
-    return {
-        "new": flats_list,
-        "ignored": ignored_list,
-        "duplicate": duplicates_by_urls
-    }
 
 @tools.timeit
 def second_pass(flats_list, constraint, config):
@@ -222,28 +202,24 @@ def second_pass(flats_list, constraint, config):
     flats_list = metadata.guess_postal_code(flats_list, constraint, config)
 
     # Better match with stations (confirm and check better)
-    flats_list = metadata.guess_stations(flats_list, constraint, config)
+    if not config["ignore_station"]:
+        flats_list = metadata.guess_stations(flats_list, constraint, config)
 
-    # Compute travel time to specified points
-    flats_list = metadata.compute_travel_times(flats_list, constraint, config)
+        # Compute travel time to specified points
+        flats_list = metadata.compute_travel_times(flats_list, constraint, config)
 
     # Remove returned housing posts that do not match criteria
-    flats_list, ignored_list = refine_with_housing_criteria(flats_list,
-                                                            constraint)
+    flats_list, ignored_list = refine_with_housing_criteria(flats_list, constraint)
 
     # Remove returned housing posts which do not match criteria relying on
     # fetched details.
-    flats_list, ignored_list = refine_with_details_criteria(flats_list,
-                                                            constraint)
+    flats_list, ignored_list = refine_with_details_criteria(flats_list, constraint)
 
     if config["serve_images_locally"]:
         images.download_images(flats_list, config)
 
-    return {
-        "new": flats_list,
-        "ignored": ignored_list,
-        "duplicate": []
-    }
+    return {"new": flats_list, "ignored": ignored_list, "duplicate": []}
+
 
 @tools.timeit
 def third_pass(flats_list, config):
@@ -262,8 +238,4 @@ def third_pass(flats_list, config):
     # Deduplicate the list using every available data
     flats_list, duplicate_flats = duplicates.deep_detect(flats_list, config)
 
-    return {
-        "new": flats_list,
-        "ignored": [],
-        "duplicate": duplicate_flats
-    }
+    return {"new": flats_list, "ignored": [], "duplicate": duplicate_flats}

+ 15
- 6
flatisfy/filters/cache.py View File

@@ -9,15 +9,19 @@ import collections
 import hashlib
 import os
 import requests
+import logging
 from io import BytesIO
 
 import PIL.Image
 
+LOGGER = logging.getLogger(__name__)
+
 
 class MemoryCache(object):
     """
     A cache in memory.
     """
+
     @staticmethod
     def on_miss(key):
         """
@@ -85,6 +89,7 @@ class ImageCache(MemoryCache):
     """
     A cache for images, stored in memory.
     """
+
     @staticmethod
     def compute_filename(url):
         """
@@ -104,23 +109,27 @@ class ImageCache(MemoryCache):
         if len(self.map.keys()) > self.max_items:
             self.map.popitem(last=False)
 
+        if url.endswith(".svg"):
+            # Skip SVG photos, which are unsupported and unlikely to be relevant
+            return None
+
+        filepath = None
         # Try to load from local folder
         if self.storage_dir:
-            filepath = os.path.join(
-                self.storage_dir,
-                self.compute_filename(url)
-            )
+            filepath = os.path.join(self.storage_dir, self.compute_filename(url))
             if os.path.isfile(filepath):
                 return PIL.Image.open(filepath)
         # Otherwise, fetch it
         try:
+            LOGGER.debug(f"Download photo from {url} to {filepath}")
             req = requests.get(url)
             req.raise_for_status()
             image = PIL.Image.open(BytesIO(req.content))
-            if self.storage_dir:
+            if filepath:
                 image.save(filepath, format=image.format)
             return image
-        except (requests.HTTPError, IOError):
+        except (requests.HTTPError, IOError) as exc:
+            LOGGER.info(f"Download photo from {url} failed: {exc}")
             return None
 
     def __init__(self, max_items=200, storage_dir=None):
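A condensed, standalone sketch of the `on_miss` flow above: skip SVGs, try the on-disk copy first, otherwise download and persist. The hash-based file name is an assumption; the real `compute_filename` may differ:

import hashlib
import os
from io import BytesIO

import PIL.Image
import requests


def fetch_image(url, storage_dir=None):
    if url.endswith(".svg"):
        return None  # unsupported, as in ImageCache.on_miss
    filepath = None
    if storage_dir:
        filepath = os.path.join(storage_dir, hashlib.sha1(url.encode("utf-8")).hexdigest())
        if os.path.isfile(filepath):
            return PIL.Image.open(filepath)  # serve the cached copy
    try:
        req = requests.get(url)
        req.raise_for_status()
        image = PIL.Image.open(BytesIO(req.content))
        if filepath:
            image.save(filepath, format=image.format)  # persist for next time
        return image
    except (requests.HTTPError, IOError):
        return None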

+ 34
- 66
flatisfy/filters/duplicates.py View File

@@ -35,14 +35,14 @@ def homogeneize_phone_number(numbers):
 
     clean_numbers = []
 
-    for number in numbers.split(','):
+    for number in numbers.split(","):
         number = number.strip()
         number = number.replace(".", "")
         number = number.replace(" ", "")
         number = number.replace("-", "")
         number = number.replace("(", "")
         number = number.replace(")", "")
-        number = re.sub(r'^\+\d\d', "", number)
+        number = re.sub(r"^\+\d\d", "", number)
 
         if not number.startswith("0"):
             number = "0" + number
@@ -94,12 +94,7 @@ def compare_photos(photo1, photo2, photo_cache, hash_threshold):
         return False
 
 
-def find_number_common_photos(
-    flat1_photos,
-    flat2_photos,
-    photo_cache,
-    hash_threshold
-):
+def find_number_common_photos(flat1_photos, flat2_photos, photo_cache, hash_threshold):
     """
     Compute the number of common photos between the two lists of photos for the
     flats.
@@ -174,22 +169,21 @@ def detect(flats_list, key="id", merge=True, should_intersect=False):
             # Sort matching flats by backend precedence
             matching_flats.sort(
                 key=lambda flat: next(
-                    i for (i, backend) in enumerate(BACKENDS_BY_PRECEDENCE)
-                    if flat["id"].endswith(backend)
+                    i for (i, backend) in enumerate(BACKENDS_BY_PRECEDENCE) if flat["id"].endswith(backend)
                 ),
-                reverse=True
+                reverse=True,
             )
 
             if len(matching_flats) > 1:
-                LOGGER.info("Found duplicates using key \"%s\": %s.",
-                            key,
-                            [flat["id"] for flat in matching_flats])
+                LOGGER.info(
+                    'Found duplicates using key "%s": %s.',
+                    key,
+                    [flat["id"] for flat in matching_flats],
+                )
             # Otherwise, check the policy
             if merge:
                 # If a merge is requested, do the merge
-                unique_flats_list.append(
-                    tools.merge_dicts(*matching_flats)
-                )
+                unique_flats_list.append(tools.merge_dicts(*matching_flats))
             else:
                 # Otherwise, just keep the most important of them
                 unique_flats_list.append(matching_flats[-1])
@@ -203,8 +197,7 @@ def detect(flats_list, key="id", merge=True, should_intersect=False):
     if should_intersect:
         # We added some flats twice with the above method, let's deduplicate on
         # id.
-        unique_flats_list, _ = detect(unique_flats_list, key="id", merge=True,
-                                      should_intersect=False)
+        unique_flats_list, _ = detect(unique_flats_list, key="id", merge=True, should_intersect=False)
 
     return unique_flats_list, duplicate_flats
 
@@ -250,14 +243,12 @@ def get_duplicate_score(flat1, flat2, photo_cache, hash_threshold):
 
         # They should have the same postal code, if available
         if (
-                "flatisfy" in flat1 and "flatisfy" in flat2 and
-                flat1["flatisfy"].get("postal_code", None) and
-                flat2["flatisfy"].get("postal_code", None)
+            "flatisfy" in flat1
+            and "flatisfy" in flat2
+            and flat1["flatisfy"].get("postal_code", None)
+            and flat2["flatisfy"].get("postal_code", None)
         ):
-            assert (
-                flat1["flatisfy"]["postal_code"] ==
-                flat2["flatisfy"]["postal_code"]
-            )
+            assert flat1["flatisfy"]["postal_code"] == flat2["flatisfy"]["postal_code"]
             n_common_items += 1
 
         # TODO: Better text comparison (one included in the other, fuzzymatch)
@@ -279,28 +270,16 @@ def get_duplicate_score(flat1, flat2, photo_cache, hash_threshold):
         # If the two flats are from the same website and have a
         # different float part, consider they cannot be duplicates. See
         # https://framagit.org/phyks/Flatisfy/issues/100.
-        both_are_from_same_backend = (
-            flat1["id"].split("@")[-1] == flat2["id"].split("@")[-1]
-        )
-        both_have_float_part = (
-            (flat1["area"] % 1) > 0 and (flat2["area"] % 1) > 0
-        )
-        both_have_equal_float_part = (
-            (flat1["area"] % 1) == (flat2["area"] % 1)
-        )
+        both_are_from_same_backend = flat1["id"].split("@")[-1] == flat2["id"].split("@")[-1]
+        both_have_float_part = (flat1["area"] % 1) > 0 and (flat2["area"] % 1) > 0
+        both_have_equal_float_part = (flat1["area"] % 1) == (flat2["area"] % 1)
         if both_have_float_part and both_are_from_same_backend:
             assert both_have_equal_float_part
 
         if flat1.get("photos", []) and flat2.get("photos", []):
-            n_common_photos = find_number_common_photos(
-                flat1["photos"],
-                flat2["photos"],
-                photo_cache,
-                hash_threshold
-            )
+            n_common_photos = find_number_common_photos(flat1["photos"], flat2["photos"], photo_cache, hash_threshold)
 
-            min_number_photos = min(len(flat1["photos"]),
-                                    len(flat2["photos"]))
+            min_number_photos = min(len(flat1["photos"]), len(flat2["photos"]))
 
             # Either all the photos are the same, or there are at least
             # three common photos.
@@ -332,9 +311,7 @@ def deep_detect(flats_list, config):
         storage_dir = os.path.join(config["data_directory"], "images")
     else:
         storage_dir = None
-    photo_cache = ImageCache(
-        storage_dir=storage_dir
-    )
+    photo_cache = ImageCache(storage_dir=storage_dir)
 
     LOGGER.info("Running deep duplicates detection.")
     matching_flats = collections.defaultdict(list)
@@ -347,30 +324,26 @@ def deep_detect(flats_list, config):
             if flat2["id"] in matching_flats[flat1["id"]]:
                 continue
 
-            n_common_items = get_duplicate_score(
-                flat1,
-                flat2,
-                photo_cache,
-                config["duplicate_image_hash_threshold"]
-            )
+            n_common_items = get_duplicate_score(flat1, flat2, photo_cache, config["duplicate_image_hash_threshold"])
 
             # Minimal score to consider they are duplicates
             if n_common_items >= config["duplicate_threshold"]:
                 # Mark flats as duplicates
                 LOGGER.info(
-                    ("Found duplicates using deep detection: (%s, %s). "
-                     "Score is %d."),
+                    ("Found duplicates using deep detection: (%s, %s). Score is %d."),
                     flat1["id"],
                    flat2["id"],
-                    n_common_items
+                    n_common_items,
                )
                 matching_flats[flat1["id"]].append(flat2["id"])
                 matching_flats[flat2["id"]].append(flat1["id"])
 
     if photo_cache.total():
-        LOGGER.debug("Photo cache: hits: %d%% / misses: %d%%.",
-                     photo_cache.hit_rate(),
-                     photo_cache.miss_rate())
+        LOGGER.debug(
+            "Photo cache: hits: %d%% / misses: %d%%.",
+            photo_cache.hit_rate(),
+            photo_cache.miss_rate(),
+        )
 
     seen_ids = []
     duplicate_flats = []
@@ -381,16 +354,11 @@ def deep_detect(flats_list, config):
 
         seen_ids.extend(matching_flats[flat_id])
         to_merge = sorted(
-            [
-                flat
-                for flat in flats_list
-                if flat["id"] in matching_flats[flat_id]
-            ],
+            [flat for flat in flats_list if flat["id"] in matching_flats[flat_id]],
             key=lambda flat: next(
-                i for (i, backend) in enumerate(BACKENDS_BY_PRECEDENCE)
-                if flat["id"].endswith(backend)
+                i for (i, backend) in enumerate(BACKENDS_BY_PRECEDENCE) if flat["id"].endswith(backend)
            ),
-            reverse=True
+            reverse=True,
        )
         unique_flats_list.append(tools.merge_dicts(*to_merge))
         # The ID of the added merged flat will be the one of the last item
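The phone-number normalization touched above strips separators and the international prefix before comparing numbers; a quick sketch of the expected behaviour (sample numbers invented):

import re


def normalize_number(number):
    # Mirrors homogeneize_phone_number for a single number.
    number = number.strip()
    for char in (".", " ", "-", "(", ")"):
        number = number.replace(char, "")
    number = re.sub(r"^\+\d\d", "", number)  # drop e.g. the +33 prefix
    if not number.startswith("0"):
        number = "0" + number
    return number


assert normalize_number("+33 6 12 34 56 78") == "0612345678"
assert normalize_number("06.12.34.56.78") == "0612345678"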

+ 2
- 9
flatisfy/filters/images.py View File

@@ -22,15 +22,8 @@ def download_images(flats_list, config):
     :param flats_list: A list of flats dicts.
     :param config: A config dict.
     """
-    photo_cache = ImageCache(
-        storage_dir=os.path.join(config["data_directory"], "images")
-    )
-    flats_list_length = len(flats_list)
-    for i, flat in enumerate(flats_list):
-        LOGGER.info(
-            "Downloading photos for flat %d/%d: %s.",
-            i + 1, flats_list_length, flat["id"]
-        )
+    photo_cache = ImageCache(storage_dir=os.path.join(config["data_directory"], "images"))
+    for flat in flats_list:
         for photo in flat["photos"]:
             # Download photo
             image = photo_cache.get(photo["url"])

+ 145
- 147
flatisfy/filters/metadata.py View File

@@ -76,10 +76,10 @@ def fuzzy_match(query, choices, limit=3, threshold=75):
 
     Example::
 
-        >>> match("Paris 14ème", ["Ris", "ris", "Paris 14"], limit=1)
+        >>> fuzzy_match("Paris 14ème", ["Ris", "ris", "Paris 14"], limit=1)
         [("Paris 14", 100)]
 
-        >>> match( \
+        >>> fuzzy_match( \
                 "Saint-Jacques, Denfert-Rochereau (Colonel Rol-Tanguy), " \
                 "Mouton-Duvernet", \
                 ["saint-jacques", "denfert rochereau", "duvernet", "toto"], \
@@ -88,8 +88,8 @@ def fuzzy_match(query, choices, limit=3, threshold=75):
         [('denfert rochereau', 100), ('saint-jacques', 76)]
     """
     # TODO: Is there a better confidence measure?
-    normalized_query = tools.normalize_string(query)
-    normalized_choices = [tools.normalize_string(choice) for choice in choices]
+    normalized_query = tools.normalize_string(query).replace("saint", "st")
+    normalized_choices = [tools.normalize_string(choice).replace("saint", "st") for choice in choices]
 
     # Remove duplicates in the choices list
     unique_normalized_choices = tools.uniqify(normalized_choices)
@@ -97,13 +97,9 @@ def fuzzy_match(query, choices, limit=3, threshold=75):
     # Get the matches (normalized strings)
     # Keep only ``limit`` matches.
     matches = sorted(
-        [
-            (choice, len(choice))
-            for choice in tools.uniqify(unique_normalized_choices)
-            if choice in normalized_query
-        ],
+        [(choice, len(choice)) for choice in tools.uniqify(unique_normalized_choices) if choice in normalized_query],
         key=lambda x: x[1],
-        reverse=True
+        reverse=True,
     )
     if limit:
         matches = matches[:limit]
@@ -111,22 +107,66 @@ def fuzzy_match(query, choices, limit=3, threshold=75):
     # Update confidence
     if matches:
         max_confidence = max(match[1] for match in matches)
-        matches = [
-            (x[0], int(x[1] / max_confidence * 100))
-            for x in matches
-        ]
+        matches = [(x[0], int(x[1] / max_confidence * 100)) for x in matches]
 
     # Convert back matches to original strings
     # Also filter out matches below threshold
-    matches = [
-        (choices[normalized_choices.index(x[0])], x[1])
-        for x in matches
-        if x[1] >= threshold
-    ]
+    matches = [(choices[normalized_choices.index(x[0])], x[1]) for x in matches if x[1] >= threshold]
 
     return matches
 
 
+def guess_location_position(location, cities, constraint, must_match):
+    # try to find a city
+    # Find all fuzzy-matching cities
+    postal_code = None
+    insee_code = None
+    position = None
+
+    matched_cities = fuzzy_match(location, [x.name for x in cities], limit=None)
+    if matched_cities:
+        # Find associated postal codes
+        matched_postal_codes = []
+        for matched_city_name, _ in matched_cities:
+            postal_code_objects_for_city = [x for x in cities if x.name == matched_city_name]
+            insee_code = [pc.insee_code for pc in postal_code_objects_for_city][0]
+            matched_postal_codes.extend(pc.postal_code for pc in postal_code_objects_for_city)
+        # Try to match them with postal codes in config constraint
+        matched_postal_codes_in_config = set(matched_postal_codes) & set(constraint["postal_codes"])
+        if matched_postal_codes_in_config:
+            # If there are some matched postal codes which are also in
+            # config, use them preferentially. This avoids ignoring
+            # incorrectly some flats in cities with multiple postal
+            # codes, see #110.
+            postal_code = next(iter(matched_postal_codes_in_config))
+        else:
+            # Otherwise, simply take any matched postal code.
+            postal_code = matched_postal_codes[0]
+
+        # take the city position
+        for matched_city_name, _ in matched_cities:
+            postal_code_objects_for_city = [
+                x for x in cities if x.name == matched_city_name and x.postal_code == postal_code
+            ]
+            if len(postal_code_objects_for_city):
+                position = {
+                    "lat": postal_code_objects_for_city[0].lat,
+                    "lng": postal_code_objects_for_city[0].lng,
+                }
+                LOGGER.debug(("Found position %s using city %s."), position, matched_city_name)
+                break
+
+    if not postal_code and must_match:
+        postal_code = cities[0].postal_code
+        position = {
+            "lat": cities[0].lat,
+            "lng": cities[0].lng,
+        }
+        insee_code = cities[0].insee_code
+
+    return (postal_code, insee_code, position)
+
+
 def guess_postal_code(flats_list, constraint, config, distance_threshold=20000):
     """
     Try to guess the postal code from the location of the flats.
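Note the normalization change in `fuzzy_match` above: both the query and the choices now fold "saint" to "st" before comparison, so the common French abbreviation matches the full spelling. A toy illustration, with `normalize_string` simplified to plain lowercasing:

def normalize(s):
    return s.lower().replace("saint", "st")


assert normalize("Rue Saint-Jacques") == normalize("rue st-jacques")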
@@ -141,24 +181,27 @@ def guess_postal_code(flats_list, constraint, config, distance_threshold=20000):
 
     :return: An updated list of flats dict with guessed postal code.
     """
-    opendata = {
-        "postal_codes": data.load_data(PostalCode, constraint, config)
-    }
+    opendata = {"postal_codes": data.load_data(PostalCode, constraint, config)}
 
     for flat in flats_list:
         location = flat.get("location", None)
+        if not location:
+            addr = flat.get("address", None)
+            if addr:
+                location = addr["full_address"]
         if not location:
             # Skip everything if empty location
             LOGGER.info(
-                (
-                    "No location field for flat %s, skipping postal "
-                    "code lookup."
-                ),
-                flat["id"]
+                ("No location field for flat %s, skipping postal code lookup. (%s)"),
+                flat["id"],
+                flat.get("address"),
             )
             continue
 
         postal_code = None
+        insee_code = None
+        position = None
+
         # Try to find a postal code directly
         try:
             postal_code = re.search(r"[0-9]{5}", location)
@@ -166,86 +209,51 @@ def guess_postal_code(flats_list, constraint, config, distance_threshold=20000):
             postal_code = postal_code.group(0)
 
             # Check the postal code is within the db
-            assert postal_code in [x.postal_code
-                                   for x in opendata["postal_codes"]]
+            assert postal_code in [x.postal_code for x in opendata["postal_codes"]]
 
-            LOGGER.info(
-                "Found postal code in location field for flat %s: %s.",
-                flat["id"], postal_code
+            LOGGER.debug(
+                "Found postal code directly in location field for flat %s: %s.",
+                flat["id"],
+                postal_code,
             )
         except AssertionError:
             postal_code = None
 
-        # If not found, try to find a city
-        if not postal_code:
-            # Find all fuzzy-matching cities
-            matched_cities = fuzzy_match(
-                location,
-                [x.name for x in opendata["postal_codes"]],
-                limit=None
-            )
-            if matched_cities:
-                # Find associated postal codes
-                matched_postal_codes = []
-                for matched_city_name, _ in matched_cities:
-                    postal_code_objects_for_city = [
-                        x for x in opendata["postal_codes"]
-                        if x.name == matched_city_name
-                    ]
-                    matched_postal_codes.extend(
-                        pc.postal_code
-                        for pc in postal_code_objects_for_city
-                    )
-                # Try to match them with postal codes in config constraint
-                matched_postal_codes_in_config = (
-                    set(matched_postal_codes) & set(constraint["postal_codes"])
-                )
-                if matched_postal_codes_in_config:
-                    # If there are some matched postal codes which are also in
-                    # config, use them preferentially. This avoid ignoring
-                    # incorrectly some flats in cities with multiple postal
-                    # codes, see #110.
-                    postal_code = next(iter(matched_postal_codes_in_config))
-                else:
-                    # Otherwise, simply take any matched postal code.
-                    postal_code = matched_postal_codes[0]
-                LOGGER.info(
-                    ("Found postal code in location field through city lookup "
-                     "for flat %s: %s."),
<