Initial commit

Lucas Verney 2017-04-03 17:29:29 +02:00
parent f060324bae
commit d7012e3834
No known key found for this signature in database
GPG Key ID: 75B45CF41F334690
31 changed files with 2518 additions and 131 deletions

407
.ci/pylintrc Normal file

@@ -0,0 +1,407 @@
[MASTER]
# Specify a configuration file.
#rcfile=
# Python code to execute, usually for sys.path manipulation such as
# pygtk.require().
#init-hook=
# Add files or directories to the blacklist. They should be base names, not
# paths.
ignore=CVS
# Add files or directories matching the regex patterns to the blacklist. The
# regex matches against base names, not paths.
ignore-patterns=
# Pickle collected data for later comparisons.
persistent=yes
# List of plugins (as comma separated values of python modules names) to load,
# usually to register additional checkers.
load-plugins=
# Use multiple processes to speed up Pylint.
jobs=1
# Allow loading of arbitrary C extensions. Extensions are imported into the
# active Python interpreter and may run arbitrary code.
unsafe-load-any-extension=no
# A comma-separated list of package or module names from where C extensions may
# be loaded. Extensions are loaded into the active Python interpreter and may
# run arbitrary code.
extension-pkg-whitelist=
# Allow optimization of some AST trees. This will activate a peephole AST
# optimizer, which will apply various small optimizations. For instance, it can
# be used to obtain the result of joining multiple strings with the addition
# operator. Joining a lot of strings can lead to a maximum recursion error in
# Pylint and this flag can prevent that. It has one side effect, the resulting
# AST will be different than the one from reality. This option is deprecated
# and it will be removed in Pylint 2.0.
optimize-ast=no
[MESSAGES CONTROL]
# Only show warnings with the listed confidence levels. Leave empty to show
# all. Valid levels: HIGH, INFERENCE, INFERENCE_FAILURE, UNDEFINED
confidence=
# Enable the message, report, category or checker with the given id(s). You can
# either give multiple identifiers separated by comma (,) or put this option
# multiple times (only on the command line, not in the configuration file where
# it should appear only once). See also the "--disable" option for examples.
#enable=
# Disable the message, report, category or checker with the given id(s). You
# can either give multiple identifiers separated by comma (,) or put this
# option multiple times (only on the command line, not in the configuration
# file where it should appear only once). You can also use "--disable=all" to
# disable everything first and then reenable specific checks. For example, if
# you want to run only the similarities checker, you can use "--disable=all
# --enable=similarities". If you want to run only the classes checker, but have
# no Warning level messages displayed, use "--disable=all --enable=classes
# --disable=W"
disable=import-star-module-level,old-octal-literal,oct-method,print-statement,unpacking-in-except,parameter-unpacking,backtick,old-raise-syntax,old-ne-operator,long-suffix,dict-view-method,dict-iter-method,metaclass-assignment,next-method-called,raising-string,indexing-exception,raw_input-builtin,long-builtin,file-builtin,execfile-builtin,coerce-builtin,cmp-builtin,buffer-builtin,basestring-builtin,apply-builtin,filter-builtin-not-iterating,using-cmp-argument,useless-suppression,range-builtin-not-iterating,suppressed-message,no-absolute-import,old-division,cmp-method,reload-builtin,zip-builtin-not-iterating,intern-builtin,unichr-builtin,reduce-builtin,standarderror-builtin,unicode-builtin,xrange-builtin,coerce-method,delslice-method,getslice-method,setslice-method,input-builtin,round-builtin,hex-method,nonzero-method,map-builtin-not-iterating
[REPORTS]
# Set the output format. Available formats are text, parseable, colorized, msvs
# (visual studio) and html. You can also give a reporter class, eg
# mypackage.mymodule.MyReporterClass.
output-format=text
# Put messages in a separate file for each module / package specified on the
# command line instead of printing them on stdout. Reports (if any) will be
# written in a file named "pylint_global.[txt|html]". This option is deprecated
# and it will be removed in Pylint 2.0.
files-output=no
# Tells whether to display a full report or only the messages
reports=yes
# Python expression which should return a score less than 10 (10 is the
# highest score). You have access to the variables error, warning, refactor,
# convention and statement, which respectively contain the number of messages
# in each category and the total number of statements analyzed. This is used
# by the global evaluation report (RP0004).
evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10)
# Template used to display messages. This is a python new-style format string
# used to format the message information. See doc for all details
#msg-template=
[BASIC]
# Good variable names which should always be accepted, separated by a comma
good-names=i,j,k,ex,Run,_,fh
# Bad variable names which should always be refused, separated by a comma
bad-names=foo,bar,baz,toto,tutu,tata
# Colon-delimited sets of names that determine each other's naming style when
# the name regexes allow several styles.
name-group=
# Include a hint for the correct naming format with invalid-name
include-naming-hint=no
# List of decorators that produce properties, such as abc.abstractproperty. Add
# to this list to register other decorators that produce valid properties.
property-classes=abc.abstractproperty
# Regular expression matching correct function names
function-rgx=[a-z_][a-z0-9_]{2,30}$
# Naming hint for function names
function-name-hint=[a-z_][a-z0-9_]{2,30}$
# Regular expression matching correct variable names
variable-rgx=[a-z_][a-z0-9_]{2,30}$
# Naming hint for variable names
variable-name-hint=[a-z_][a-z0-9_]{2,30}$
# Regular expression matching correct constant names
const-rgx=(([A-Z_][A-Z0-9_]*)|(__.*__))$
# Naming hint for constant names
const-name-hint=(([A-Z_][A-Z0-9_]*)|(__.*__))$
# Regular expression matching correct attribute names
attr-rgx=[a-z_][a-z0-9_]{2,30}$
# Naming hint for attribute names
attr-name-hint=[a-z_][a-z0-9_]{2,30}$
# Regular expression matching correct argument names
argument-rgx=[a-z_][a-z0-9_]{2,30}$
# Naming hint for argument names
argument-name-hint=[a-z_][a-z0-9_]{2,30}$
# Regular expression matching correct class attribute names
class-attribute-rgx=([A-Za-z_][A-Za-z0-9_]{2,30}|(__.*__))$
# Naming hint for class attribute names
class-attribute-name-hint=([A-Za-z_][A-Za-z0-9_]{2,30}|(__.*__))$
# Regular expression matching correct inline iteration names
inlinevar-rgx=[A-Za-z_][A-Za-z0-9_]*$
# Naming hint for inline iteration names
inlinevar-name-hint=[A-Za-z_][A-Za-z0-9_]*$
# Regular expression matching correct class names
class-rgx=[A-Z_][a-zA-Z0-9]+$
# Naming hint for class names
class-name-hint=[A-Z_][a-zA-Z0-9]+$
# Regular expression matching correct module names
module-rgx=(([a-z_][a-z0-9_]*)|([A-Z][a-zA-Z0-9]+))$
# Naming hint for module names
module-name-hint=(([a-z_][a-z0-9_]*)|([A-Z][a-zA-Z0-9]+))$
# Regular expression matching correct method names
method-rgx=[a-z_][a-z0-9_]{2,30}$
# Naming hint for method names
method-name-hint=[a-z_][a-z0-9_]{2,30}$
# Regular expression which should only match function or class names that do
# not require a docstring.
no-docstring-rgx=^_
# Minimum line length for functions/classes that require docstrings, shorter
# ones are exempt.
docstring-min-length=-1
[ELIF]
# Maximum number of nested blocks for function / method body
max-nested-blocks=5
[FORMAT]
# Maximum number of characters on a single line.
max-line-length=100
# Regexp for a line that is allowed to be longer than the limit.
ignore-long-lines=^\s*(# )?<?https?://\S+>?$
# Allow the body of an if to be on the same line as the test if there is no
# else.
single-line-if-stmt=no
# List of optional constructs for which whitespace checking is disabled. `dict-
# separator` is used to allow tabulation in dicts, etc.: {1 : 1,\n222: 2}.
# `trailing-comma` allows a space between comma and closing bracket: (a, ).
# `empty-line` allows space-only lines.
no-space-check=trailing-comma,dict-separator
# Maximum number of lines in a module
max-module-lines=1000
# String used as indentation unit. This is usually "    " (4 spaces) or "\t" (1
# tab).
indent-string='    '
# Number of spaces of indent required inside a hanging or continued line.
indent-after-paren=4
# Expected format of line ending, e.g. empty (any line ending), LF or CRLF.
expected-line-ending-format=
[LOGGING]
# Logging modules to check that the string format arguments are in logging
# function parameter format
logging-modules=logging
[MISCELLANEOUS]
# List of note tags to take in consideration, separated by a comma.
notes=FIXME,XXX,TODO
[SIMILARITIES]
# Minimum lines number of a similarity.
min-similarity-lines=4
# Ignore comments when computing similarities.
ignore-comments=yes
# Ignore docstrings when computing similarities.
ignore-docstrings=yes
# Ignore imports when computing similarities.
ignore-imports=no
[SPELLING]
# Spelling dictionary name. Available dictionaries: none. To make it work,
# install the python-enchant package.
spelling-dict=
# List of comma separated words that should not be checked.
spelling-ignore-words=
# A path to a file that contains private dictionary; one word per line.
spelling-private-dict-file=
# Tells whether to store unknown words to indicated private dictionary in
# --spelling-private-dict-file option instead of raising a message.
spelling-store-unknown-words=no
[TYPECHECK]
# Tells whether missing members accessed in mixin class should be ignored. A
# mixin class is detected if its name ends with "mixin" (case insensitive).
ignore-mixin-members=yes
# List of module names for which member attributes should not be checked
# (useful for modules/projects where namespaces are manipulated during runtime
# and thus existing member attributes cannot be deduced by static analysis).
# It supports qualified module names, as well as Unix pattern matching.
ignored-modules=
# List of class names for which member attributes should not be checked (useful
# for classes with dynamically set attributes). This supports the use of
# qualified names.
ignored-classes=optparse.Values,thread._local,_thread._local
# List of members which are set dynamically and missed by pylint inference
# system, and so shouldn't trigger E1101 when accessed. Python regular
# expressions are accepted.
generated-members=
# List of decorators that produce context managers, such as
# contextlib.contextmanager. Add to this list to register other decorators that
# produce valid context managers.
contextmanager-decorators=contextlib.contextmanager
[VARIABLES]
# Tells whether we should check for unused import in __init__ files.
init-import=no
# A regular expression matching the name of dummy variables (i.e. expectedly
# not used).
dummy-variables-rgx=(_+[a-zA-Z0-9]*?$)|dummy
# List of additional names supposed to be defined in builtins. Remember that
# you should avoid defining new builtins when possible.
additional-builtins=
# List of strings which can identify a callback function by name. A callback
# name must start or end with one of those strings.
callbacks=cb_,_cb
# List of qualified module names which can have objects that can redefine
# builtins.
redefining-builtins-modules=six.moves,future.builtins,builtins
[CLASSES]
# List of method names used to declare (i.e. assign) instance attributes.
defining-attr-methods=__init__,__new__,setUp
# List of valid names for the first argument in a class method.
valid-classmethod-first-arg=cls
# List of valid names for the first argument in a metaclass class method.
valid-metaclass-classmethod-first-arg=mcs
# List of member names, which should be excluded from the protected access
# warning.
exclude-protected=_asdict,_fields,_replace,_source,_make
[DESIGN]
# Maximum number of arguments for function / method
max-args=5
# Argument names that match this expression will be ignored. Default to name
# with leading underscore
ignored-argument-names=_.*
# Maximum number of locals for function / method body
max-locals=15
# Maximum number of return / yield for function / method body
max-returns=6
# Maximum number of branch for function / method body
max-branches=12
# Maximum number of statements in function / method body
max-statements=50
# Maximum number of parents for a class (see R0901).
max-parents=7
# Maximum number of attributes for a class (see R0902).
max-attributes=7
# Minimum number of public methods for a class (see R0903).
min-public-methods=2
# Maximum number of public methods for a class (see R0904).
max-public-methods=20
# Maximum number of boolean expressions in a if statement
max-bool-expr=5
[IMPORTS]
# Deprecated modules which should not be used, separated by a comma
deprecated-modules=regsub,TERMIOS,Bastion,rexec
# Create a graph of every (i.e. internal and external) dependencies in the
# given file (report RP0402 must not be disabled)
import-graph=
# Create a graph of external dependencies in the given file (report RP0402 must
# not be disabled)
ext-import-graph=
# Create a graph of internal dependencies in the given file (report RP0402 must
# not be disabled)
int-import-graph=
# Force import order to recognize a module as part of the standard
# compatibility libraries.
known-standard-library=
# Force import order to recognize a module as part of a third party library.
known-third-party=enchant
# Analyse import fallback blocks. This can be used to support both Python 2 and
# 3 compatible code, which means that the block might have code that exists
# only in one or another interpreter, leading to false positives when analysed.
analyse-fallback-blocks=no
[EXCEPTIONS]
# Exceptions that will emit a warning when being caught. Defaults to
# "Exception"
overgeneral-exceptions=Exception

5
.gitignore vendored

@@ -1,3 +1,6 @@
 build
 *.json
-config.py
+*.pyc
+*.swp
+*.swo
+*.db

21
LICENSE.md Normal file

@@ -0,0 +1,21 @@
The MIT License (MIT)
Copyright (c) 2017 Phyks (Lucas Verney)
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

115
README.md Normal file

@@ -0,0 +1,115 @@
Flatisfy
========
Flatisfy is your new companion to ease your search for new housing :)
It uses [Weboob](http://weboob.org/) to fetch housing posts from most of the
websites that offer them, and then provides a set of pipelines to filter and
deduplicate the fetched housings.
It can be used as a command-line utility, but it also exposes a web API and a
web visualisation to browse through the results.
_Note_: It is targeted at French users (due to the currently supported
websites), and in particular at people living close to Paris, as I developed
it for my personal use and am currently living in Paris :) Any feedback and
merge requests to better support other countries / cities are more than
welcome!
_Note_: In this repository and across the code, I am using the name "flat". I
use it as a placeholder for "housing" and consider the two interchangeable.
This code is not restricted to handling flats only!
## Getting started
1. Clone the repository.
2. Install required Python modules: `pip install -r requirements.txt`.
3. Init a configuration file: `python -m flatisfy init-config > config.json`.
Edit it according to your needs (see below).
4. Build the required data files:
`python -m flatisfy build-data --config config.json`.
5. Use the `fetch` command to output a filtered JSON list of flats matching
   your criteria, or the `import` command to store them in an SQLite
   database for the web visualization.
6. Use `python -m flatisfy serve --config config.json` to serve the web app.
## Configuration
List of configuration options:
* `data_directory` is the directory in which you want data files to be stored.
`null` is the default value and means default `XDG` location (typically
`~/.local/share/flatisfy/`)
* `max_entries` is the maximum number of entries to fetch **per Weboob
backend** (that is per housing website).
* `passes` is the number of passes to run on the data. The first pass does
  basic filtering using only the information from the housing list pages.
  The second pass loads all available details about the filtered flats and
  filters them more accurately.
* `queries` is a list of queries defined in `flatboob` that should be fetched.
* `database` is an SQLAlchemy URI to a database file. Defaults to `null` which
means that it will store the database in the default location, in
`data_directory`.
* `navitia_api_key` is an API token for [Navitia](https://www.navitia.io/)
which is required to compute travel times.
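For reference, here is a sketch of a full `config.json` using these options
(the query name and postal codes below are placeholders; adapt them to your
own `flatboob` setup):
```
{
    "queries": ["my_paris_query"],
    "constraints": {
        "postal_codes": ["75013", "75014"],
        "area": [30, null],
        "cost": [null, 1500],
        "rooms": [2, null],
        "bedrooms": [null, null],
        "time_to": {}
    },
    "navitia_api_key": null,
    "passes": 2,
    "max_entries": null,
    "data_directory": null,
    "database": null,
    "port": 8080,
    "host": "127.0.0.1"
}
```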
### Constraints
You can specify constraints, under the `constraints` key. The available
constraints are:
* `area` (in m²), `bedrooms`, `cost` (in currency unit), `rooms`: each of
  these is a tuple of `(min, max)` values, defining an interval in which the
  value should lie. A `null` bound means the interval is unbounded on that
  side.
* `postal_codes` is a list of allowed postal codes. You should include any
postal code you want, and especially the postal codes close to the precise
location you want. You MUST provide some postal codes.
* `time_to` is a dictionary of places to compute travel time to them.
Typically,
```
"time_to": {
"foobar": {
"gps": [LAT, LNG],
"time": [min, max]
}
}
```
means that the travel time from the housing to the place identified by the GPS
coordinates `LAT` and `LNG` (latitude and longitude) must lie between the
`min` and `max` bounds (either of which can be `null`); `foobar` is the
human-readable name given to this place. Beware that `time` constraints are in
**seconds**.
## OpenData
I am using the following datasets, available under `flatisfy/data_files`,
which cover Paris. If you want to run the script for some other location, you
might have to replace these files with matching datasets.
* [LaPoste Hexasmal](https://datanova.legroupe.laposte.fr/explore/dataset/laposte_hexasmal/?disjunctive.code_commune_insee&disjunctive.nom_de_la_commune&disjunctive.code_postal&disjunctive.libell_d_acheminement&disjunctive.ligne_5) for the list of cities and postal codes in France.
* [RATP stations](https://data.ratp.fr/explore/dataset/positions-geographiques-des-stations-du-reseau-ratp/table/?disjunctive.stop_name&disjunctive.code_postal&disjunctive.departement) for the list of subway stations with their positions in Paris and nearby areas.
Both datasets are licensed under the Open Data Commons Open Database License
(ODbL): https://opendatacommons.org/licenses/odbl/.
## License
The content of this repository is licensed under an MIT license, unless
explicitly mentioned otherwise.
## Thanks
* [Weboob](http://weboob.org/)
* The OpenData providers listed above!
* Navitia for their really cool public transportation API.
* Lots of Python modules, required for this script (see `requirements.txt`).
* [Kresus](https://framagit.org/bnjbvr/kresus), which gave me part of the
  original idea (it at least proved to me that scraping-based software can
  achieve a high level of quality :).

130
flat.py

@@ -1,130 +0,0 @@
# coding: utf-8
#!/usr/bin/env python3
import json
import os
import subprocess
import sys
from fuzzywuzzy import process as fuzzyprocess
import config
def pretty_json(json_str):
return json.dumps(json_str, indent=4, separators=(',', ': '),
sort_keys=True)
def preprocess_data():
if not os.path.isdir("build"):
os.mkdir("build")
if not os.path.isfile("build/ratp.json"):
ratp_data = []
with open("data/ratp.json", "r") as fh:
ratp_data = json.load(fh)
ratp_data = sorted(
list(set(
x["fields"]["stop_name"].lower() for x in ratp_data
))
)
with open("build/ratp.json", "w") as fh:
fh.write(pretty_json(ratp_data))
def fetch_flats_list():
flats_list = []
for query in config.QUERIES:
flatboob_output = subprocess.check_output(
["flatboob", "-n", "0", "-f", "json", "load", query]
)
flats_list.extend(json.loads(flatboob_output))
return flats_list
def remove_duplicates(flats_list):
unique_flats_list = []
ids = []
for flat in flats_list:
if flat["id"] in ids:
continue
        ids.append(flat["id"])
unique_flats_list.append(flat)
return unique_flats_list
def sort_by(flats_list, key="cost"):
    return sorted(flats_list, key=lambda x: x[key])
def refine_params(flats_list):
def filter_conditions(x):
is_ok = True
if "cost" in x:
cost = x["cost"]
is_ok = (
is_ok and
(cost < config.PARAMS["max_cost"] and
cost > config.PARAMS["min_cost"])
)
if "area" in x:
area = x["area"]
is_ok = (
is_ok and
(area < config.PARAMS["max_area"] and
area > config.PARAMS["min_area"])
)
return is_ok
return filter(filter_conditions, flats_list)
def match_ratp(flats_list):
ratp_stations = []
with open("build/ratp.json", "r") as fh:
ratp_stations = json.load(fh)
for flat in flats_list:
if "station" in flat and flat["station"]:
# There is some station fetched by flatboob, try to match it
flat["ratp_station"] = fuzzyprocess.extractOne(
flat["station"], ratp_stations
)
# TODO: Cross-check station location to choose the best fit
return flats_list
def main(dumpfile=None):
if dumpfile is None:
flats_list = fetch_flats_list()
else:
with open(dumpfile, "r") as fh:
flats_list = json.load(fh)
# First pass
flats_list = remove_duplicates(flats_list)
flats_list = sort_by(flats_list, "cost")
flats_list = refine_params(flats_list)
# TODO: flats_list = match_ratp(flats_list)
# TODO: Second pass, loading additional infos for each entry
return flats_list
if __name__ == "__main__":
if len(sys.argv) > 1:
dumpfile = sys.argv[1]
else:
dumpfile = None
try:
preprocess_data()
flats_list = main(dumpfile)
print(
pretty_json(flats_list)
)
except KeyboardInterrupt:
pass

5
flatisfy/__init__.py Normal file

@@ -0,0 +1,5 @@
# coding: utf-8
"""
``Flatisfy`` is a tool to help you find a new housing based on some criteria.
"""
__version__ = "0.1"

176
flatisfy/__main__.py Normal file

@@ -0,0 +1,176 @@
# coding: utf-8
"""
Main entry point of the Flatisfy code.
"""
from __future__ import absolute_import, print_function, unicode_literals
import argparse
import logging
import sys
import flatisfy.config
import flatisfy.exceptions
from flatisfy import cmds
from flatisfy import data
from flatisfy import tools
LOGGER = logging.getLogger("flatisfy")
def parse_args(argv=None):
"""
Create parser and parse arguments.
"""
parser = argparse.ArgumentParser(prog="Flatisfy",
description="Find the perfect flat.")
# Parent parser containing arguments common to any subcommand
parent_parser = argparse.ArgumentParser(add_help=False)
parent_parser.add_argument(
"--data-dir",
help="Location of Flatisfy data directory."
)
parent_parser.add_argument(
"--config",
help="Configuration file to use."
)
parent_parser.add_argument(
"--passes", choices=[0, 1, 2], type=int,
help="Number of passes to do on the filtered data."
)
parent_parser.add_argument(
"--max-entries", type=int,
help="Maximum number of entries to fetch."
)
parent_parser.add_argument(
"-v", "--verbose", action="store_true",
help="Verbose logging output."
)
parent_parser.add_argument(
"-vv", action="store_true",
help="Debug logging output."
)
# Subcommands
subparsers = parser.add_subparsers(
dest="cmd", help="Available subcommands"
)
# Build data subcommand
subparsers.add_parser(
"build-data", parents=[parent_parser],
help="Build necessary data"
)
# Init config subcommand
parser_init_config = subparsers.add_parser(
"init-config", parents=[parent_parser],
help="Initialize empty configuration."
)
parser_init_config.add_argument(
"output", nargs="?", help="Output config file. Use '-' for stdout."
)
# Fetch subcommand parser
subparsers.add_parser("fetch", parents=[parent_parser],
help="Fetch housings posts")
# Filter subcommand parser
parser_filter = subparsers.add_parser("filter", parents=[parent_parser],
help=(
"Filter housings posts. No "
"fetching of additional infos "
"is done."))
parser_filter.add_argument(
"input",
help="JSON dump of the housings post to filter."
)
# Import subcommand parser
subparsers.add_parser("import", parents=[parent_parser],
help="Import housing posts in database.")
# Serve subcommand parser
parser_serve = subparsers.add_parser("serve", parents=[parent_parser],
help="Serve the web app.")
parser_serve.add_argument("--port", type=int, help="Port to bind to.")
parser_serve.add_argument("--host", help="Host to listen on.")
return parser.parse_args(argv)
def main():
"""
Main module code.
"""
# Parse arguments
args = parse_args()
# Set logger
if args.vv:
logging.basicConfig(level=logging.DEBUG)
logging.getLogger('sqlalchemy.engine').setLevel(logging.DEBUG)
elif args.verbose:
logging.basicConfig(level=logging.INFO)
# sqlalchemy INFO level is way too loud, just stick with WARNING
logging.getLogger('sqlalchemy.engine').setLevel(logging.WARNING)
else:
logging.basicConfig(level=logging.WARNING)
logging.getLogger('sqlalchemy.engine').setLevel(logging.WARNING)
# Init-config command
if args.cmd == "init-config":
flatisfy.config.init_config(args.output)
sys.exit(0)
else:
# Load config
config = flatisfy.config.load_config(args)
if config is None:
LOGGER.error("Invalid configuration. Exiting. "
"Run init-config before if this is the first time "
"you run Flatisfy.")
sys.exit(1)
# Build data files
try:
if args.cmd == "build-data":
data.preprocess_data(config, force=True)
sys.exit(0)
else:
data.preprocess_data(config)
except flatisfy.exceptions.DataBuildError:
sys.exit(1)
# Fetch command
if args.cmd == "fetch":
# Fetch and filter flats list
flats_list, _ = cmds.fetch_and_filter(config)
# Sort by cost
flats_list = tools.sort_list_of_dicts_by(flats_list, "cost")
print(
tools.pretty_json(flats_list)
)
# Filter command
elif args.cmd == "filter":
# Load and filter flats list
flats_list = cmds.load_and_filter(args.input, config)
# Sort by cost
flats_list = tools.sort_list_of_dicts_by(flats_list, "cost")
print(
tools.pretty_json(flats_list)
)
# Import command
elif args.cmd == "import":
cmds.import_and_filter(config)
# Serve command
elif args.cmd == "serve":
cmds.serve(config)
if __name__ == "__main__":
try:
main()
except KeyboardInterrupt:
pass

110
flatisfy/cmds.py Normal file

@@ -0,0 +1,110 @@
# coding: utf-8
"""
Main commands available for flatisfy.
"""
from __future__ import absolute_import, print_function, unicode_literals
import flatisfy.filters
from flatisfy import database
from flatisfy.models import flat as flat_model
from flatisfy import fetch
from flatisfy import tools
from flatisfy.web import app as web_app
def fetch_and_filter(config):
"""
Fetch the available flats list. Then, filter it according to criteria.
:param config: A config dict.
:return: A tuple of the list of all matching flats and the list of ignored
flats.
"""
# TODO: Reduce load on housings listing websites
# Fetch flats list with flatboobs
flats_list = fetch.fetch_flats_list(config)
# Do a first pass with the available infos to try to remove as much
# unwanted postings as possible
    # No flats are ignored by default (when no filtering pass runs).
    ignored_flats = []
    if config["passes"] > 0:
flats_list, ignored_flats = flatisfy.filters.first_pass(flats_list,
config)
# Do a second pass to consolidate all the infos we found and make use of
# additional infos
if config["passes"] > 1:
# Load additional infos
for flat in flats_list:
details = fetch.fetch_details(flat["id"])
flat = tools.merge_dicts(flat, details)
flats_list, extra_ignored_flats = flatisfy.filters.second_pass(
flats_list, config
)
ignored_flats.extend(extra_ignored_flats)
return flats_list, ignored_flats
def load_and_filter(housing_file, config):
"""
Load the dumped flats list. Then, filter it according to criteria.
:param housing_file: The JSON file to load flats from.
:param config: A config dict.
:return: A tuple of the list of all matching flats and the list of ignored
flats.
"""
# Load flats list
flats_list = fetch.load_flats_list(housing_file)
# Do a first pass with the available infos to try to remove as much
# unwanted postings as possible
    # No flats are ignored by default (when no filtering pass runs).
    ignored_flats = []
    if config["passes"] > 0:
flats_list, ignored_flats = flatisfy.filters.first_pass(flats_list,
config)
# Do a second pass to consolidate all the infos we found
if config["passes"] > 1:
flats_list, extra_ignored_flats = flatisfy.filters.second_pass(
flats_list, config
)
ignored_flats.extend(extra_ignored_flats)
return flats_list, ignored_flats
def import_and_filter(config):
"""
Fetch the available flats list. Then, filter it according to criteria.
Finally, store it in the database.
:param config: A config dict.
:return: ``None``.
"""
# Fetch and filter flats list
flats_list, purged_list = fetch_and_filter(config)
# Create database connection
get_session = database.init_db(config["database"])
with get_session() as session:
for flat_dict in flats_list:
flat = flat_model.Flat.from_dict(flat_dict)
session.merge(flat)
for flat_dict in purged_list:
flat = flat_model.Flat.from_dict(flat_dict)
flat.status = flat_model.FlatStatus.purged
session.merge(flat)
def serve(config):
"""
Serve the web app.
:param config: A config dict.
:return: ``None``, long-running process.
"""
app = web_app.get_app(config)
# TODO: Make Bottle use logging module
app.run(host=config["host"], port=config["port"])
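`tools.merge_dicts`, used above to consolidate a flat with its fetched
details, is not shown in this commit. A plausible minimal sketch, assuming
later dicts take precedence for non-empty values (the real `flatisfy.tools`
helper may differ):
```
def merge_dicts(*dicts):
    """
    Merge the given dicts into a single one, left to right. Sketch only:
    this is an assumption about the real flatisfy.tools implementation.
    """
    merged = {}
    for next_dict in dicts:
        for key, value in next_dict.items():
            # Assumption: never overwrite an existing value with an empty
            # one coming from a less complete posting.
            if value or key not in merged:
                merged[key] = value
    return merged
```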

208
flatisfy/config.py Normal file

@@ -0,0 +1,208 @@
# coding: utf-8
"""
This module handles the configuration management for Flatisfy.
It loads the default configuration, then overloads it with the provided config
file and then overloads it with command-line options.
"""
from __future__ import absolute_import, print_function, unicode_literals
from builtins import str
import json
import logging
import os
import sys
import traceback
import appdirs
from flatisfy import tools
# Default configuration
DEFAULT_CONFIG = {
# Flatboob queries to fetch
"queries": [],
# Constraints to match
"constraints": {
"postal_codes": [], # List of postal codes
"area": (None, None), # (min, max) in m^2
"cost": (None, None), # (min, max) in currency unit
"rooms": (None, None), # (min, max)
"bedrooms": (None, None), # (min, max)
"time_to": {} # Dict mapping names to {"gps": [lat, lng],
# "time": (min, max) }
# Time is in seconds
},
# Navitia API key
"navitia_api_key": None,
# Number of filtering passes to run
"passes": 2,
# Maximum number of entries to fetch
"max_entries": None,
    # Directory in which data will be put. ``None`` is XDG default location.
"data_directory": None,
# SQLAlchemy URI to the database to use
"database": None,
# Web app port
"port": 8080,
# Web app host to listen on
"host": "127.0.0.1"
}
LOGGER = logging.getLogger(__name__)
def validate_config(config):
"""
Check that the config passed as argument is a valid configuration.
    :param config: A config dictionary to validate.
    :return: ``True`` if the configuration is valid; otherwise, the source
        line of the failing assertion.
"""
def _check_constraints_bounds(bounds):
"""
Check the bounds for numeric constraints.
"""
assert len(bounds) == 2
assert all(
x is None or
(
(isinstance(x, int) or isinstance(x, float)) and
x >= 0
)
for x in bounds
)
if bounds[0] is not None and bounds[1] is not None:
assert bounds[1] > bounds[0]
try:
        # Note: The traceback fetching code only handles single-line asserts.
# Then, we disable line-too-long pylint check and E501 flake8 checks
# and use long lines whenever needed, in order to have the full assert
# message in the log output.
# pylint: disable=line-too-long
assert "postal_codes" in config["constraints"]
assert len(config["constraints"]["postal_codes"]) > 0
assert "area" in config["constraints"]
_check_constraints_bounds(config["constraints"]["area"])
assert "cost" in config["constraints"]
_check_constraints_bounds(config["constraints"]["cost"])
assert "rooms" in config["constraints"]
_check_constraints_bounds(config["constraints"]["rooms"])
assert "bedrooms" in config["constraints"]
_check_constraints_bounds(config["constraints"]["bedrooms"])
assert "time_to" in config["constraints"]
assert isinstance(config["constraints"]["time_to"], dict)
for name, item in config["constraints"]["time_to"].items():
assert isinstance(name, str)
assert "gps" in item
assert isinstance(item["gps"], list)
assert len(item["gps"]) == 2
assert "time" in item
_check_constraints_bounds(item["time"])
assert config["passes"] in [0, 1, 2]
assert config["max_entries"] is None or (isinstance(config["max_entries"], int) and config["max_entries"] > 0) # noqa: E501
assert config["data_directory"] is None or isinstance(config["data_directory"], str) # noqa: E501
assert config["database"] is None or isinstance(config["database"], str) # noqa: E501
assert isinstance(config["port"], int)
assert isinstance(config["host"], str)
return True
except (AssertionError, KeyError):
_, _, exc_traceback = sys.exc_info()
return traceback.extract_tb(exc_traceback)[-1][-1]
def load_config(args=None):
"""
Load the configuration from file.
:param args: An argparse args structure.
:return: The loaded config dict.
"""
LOGGER.info("Initializing configuration...")
# Default configuration
config_data = DEFAULT_CONFIG.copy()
# Load config from specified JSON
if args and getattr(args, "config", None):
LOGGER.debug("Loading configuration from %s.", args.config)
try:
with open(args.config, "r") as fh:
config_data.update(json.load(fh))
except (IOError, ValueError):
LOGGER.error(
"Unable to load configuration from file, "
"using default configuration."
)
# Overload config with arguments
if args and getattr(args, "passes", None) is not None:
LOGGER.debug(
"Overloading number of passes from CLI arguments: %d.",
args.passes
)
config_data["passes"] = args.passes
if args and getattr(args, "max_entries", None) is not None:
LOGGER.debug(
"Overloading maximum number of entries from CLI arguments: %d.",
args.max_entries
)
config_data["max_entries"] = args.max_entries
if args and getattr(args, "port", None) is not None:
LOGGER.debug("Overloading web app port: %d.", args.port)
config_data["port"] = args.port
if args and getattr(args, "host", None) is not None:
LOGGER.debug("Overloading web app host: %s.", args.host)
config_data["host"] = str(args.host)
# Handle data_directory option
if args and getattr(args, "data_dir", None) is not None:
LOGGER.debug("Overloading data directory from CLI arguments.")
config_data["data_directory"] = args.data_dir
elif config_data["data_directory"] is None:
config_data["data_directory"] = appdirs.user_data_dir(
"flatisfy",
"flatisfy"
)
LOGGER.debug("Using default XDG data directory: %s.",
config_data["data_directory"])
if config_data["database"] is None:
config_data["database"] = "sqlite:///" + os.path.join(
config_data["data_directory"],
"flatisfy.db"
)
config_validation = validate_config(config_data)
if config_validation is True:
LOGGER.info("Config has been fully initialized.")
return config_data
else:
LOGGER.error("Error in configuration: %s.", config_validation)
return None
def init_config(output=None):
"""
Initialize an empty configuration file.
    :param output: File to output content to. Defaults to ``stdout``.
"""
config_data = DEFAULT_CONFIG.copy()
if output and output != "-":
with open(output, "w") as fh:
fh.write(tools.pretty_json(config_data))
else:
print(tools.pretty_json(config_data))
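The error-reporting trick in `validate_config` above deserves a note: when an
assert fails, the source text of the failing line can be recovered from the
traceback and returned as the error message. A minimal, self-contained demo
of the same pattern (run it as a script so the source lines are available to
`traceback`):
```
import sys
import traceback


def check(config):
    try:
        assert config["port"] > 0
        assert isinstance(config["host"], str)
        return True
    except (AssertionError, KeyError):
        _, _, exc_traceback = sys.exc_info()
        # Each traceback entry is (filename, line number, function, text);
        # the last entry's text is the source of the failing assert.
        return traceback.extract_tb(exc_traceback)[-1][-1]


print(check({"port": -1, "host": "127.0.0.1"}))
# Prints: assert config["port"] > 0
```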

163
flatisfy/data.py Normal file

@@ -0,0 +1,163 @@
# coding: utf-8
"""
This module contains all the code related to building necessary data files from
the source opendata files.
"""
from __future__ import absolute_import, print_function, unicode_literals
import collections
import json
import logging
import os
import flatisfy.exceptions
LOGGER = logging.getLogger(__name__)
MODULE_DIR = os.path.dirname(os.path.realpath(__file__))
def _preprocess_ratp(output_dir):
"""
Build RATP file from the RATP data.
:param output_dir: Directory in which the output file should reside.
:return: ``True`` on successful build, ``False`` otherwise.
"""
ratp_data_raw = []
# Load opendata file
try:
with open(os.path.join(MODULE_DIR, "data_files/ratp.json"), "r") as fh:
ratp_data_raw = json.load(fh)
except (IOError, ValueError):
LOGGER.error("Invalid raw RATP opendata file.")
return False
# Process it
ratp_data = collections.defaultdict(list)
for item in ratp_data_raw:
stop_name = item["fields"]["stop_name"].lower()
ratp_data[stop_name].append(item["fields"]["coord"])
# Output it
with open(os.path.join(output_dir, "ratp.json"), "w") as fh:
json.dump(ratp_data, fh)
return True
def _preprocess_laposte(output_dir):
"""
Build JSON files from the postal codes data.
:param output_dir: Directory in which the output file should reside.
:return: ``True`` on successful build, ``False`` otherwise.
"""
raw_laposte_data = []
# Load opendata file
try:
with open(
os.path.join(MODULE_DIR, "data_files/laposte.json"), "r"
) as fh:
raw_laposte_data = json.load(fh)
except (IOError, ValueError):
LOGGER.error("Invalid raw LaPoste opendata file.")
return False
# Build postal codes to other infos file
postal_codes_data = {}
for item in raw_laposte_data:
try:
postal_codes_data[item["fields"]["code_postal"]] = {
"gps": item["fields"]["coordonnees_gps"],
"nom": item["fields"]["nom_de_la_commune"].title()
}
except KeyError:
LOGGER.info("Missing data for postal code %s, skipping it.",
item["fields"]["code_postal"])
with open(os.path.join(output_dir, "postal_codes.json"), "w") as fh:
json.dump(postal_codes_data, fh)
# Build city name to postal codes and other infos file
cities_data = {}
for item in raw_laposte_data:
try:
cities_data[item["fields"]["nom_de_la_commune"].title()] = {
"gps": item["fields"]["coordonnees_gps"],
"postal_code": item["fields"]["code_postal"]
}
except KeyError:
LOGGER.info("Missing data for city %s, skipping it.",
item["fields"]["nom_de_la_commune"])
with open(os.path.join(output_dir, "cities.json"), "w") as fh:
json.dump(cities_data, fh)
return True
def preprocess_data(config, force=False):
"""
Ensures that all the necessary data files have been built from the raw
opendata files.
    :param config: A config dictionary.
    :param force: Whether to force rebuild or not.
"""
LOGGER.debug("Data directory is %s.", config["data_directory"])
opendata_directory = os.path.join(config["data_directory"], "opendata")
try:
LOGGER.info("Ensuring the data directory exists.")
os.makedirs(opendata_directory)
LOGGER.debug("Created opendata directory at %s.", opendata_directory)
except OSError:
LOGGER.debug("Opendata directory already existed, doing nothing.")
is_built_ratp = os.path.isfile(
os.path.join(opendata_directory, "ratp.json")
)
if not is_built_ratp or force:
LOGGER.info("Building from RATP data.")
if not _preprocess_ratp(opendata_directory):
raise flatisfy.exceptions.DataBuildError("Error with RATP data.")
is_built_laposte = (
os.path.isfile(os.path.join(opendata_directory, "cities.json")) and
os.path.isfile(os.path.join(opendata_directory, "postal_codes.json"))
)
if not is_built_laposte or force:
LOGGER.info("Building from LaPoste data.")
if not _preprocess_laposte(opendata_directory):
raise flatisfy.exceptions.DataBuildError(
"Error with LaPoste data."
)
def load_data(data_type, config):
"""
Load a given built data file.
:param data_type: A valid data identifier.
:param config: A config dictionary.
:return: The loaded data. ``None`` if the query is incorrect.
"""
if data_type not in ["postal_codes", "cities", "ratp"]:
LOGGER.error("Invalid request. No %s data file.", data_type)
return None
opendata_directory = os.path.join(config["data_directory"], "opendata")
datafile_path = os.path.join(opendata_directory, "%s.json" % data_type)
data = {}
try:
with open(datafile_path, "r") as fh:
data = json.load(fh)
except IOError:
LOGGER.error("No such data file: %s.", datafile_path)
return None
except ValueError:
LOGGER.error("Invalid JSON data file: %s.", datafile_path)
return None
if len(data) == 0:
LOGGER.warning("Loading empty data for %s.", data_type)
return data
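The shape of the built `ratp.json` file matters later on (in
`flatisfy.filters.metadata.guess_stations`): `_preprocess_ratp` buckets every
coordinate under its lowercased stop name, so one station name maps to a list
of coordinates. A small illustration of that step, with made-up data:
```
import collections

ratp_data_raw = [
    {"fields": {"stop_name": "Nation", "coord": [48.848, 2.396]}},
    {"fields": {"stop_name": "NATION", "coord": [48.846, 2.398]}},
]
ratp_data = collections.defaultdict(list)
for item in ratp_data_raw:
    stop_name = item["fields"]["stop_name"].lower()
    ratp_data[stop_name].append(item["fields"]["coord"])

print(dict(ratp_data))
# {'nation': [[48.848, 2.396], [48.846, 2.398]]}
```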

File diff suppressed because one or more lines are too long

64
flatisfy/database/__init__.py Normal file

@@ -0,0 +1,64 @@
# coding: utf-8
"""
This module contains functions related to the database.
"""
from __future__ import absolute_import, print_function, unicode_literals
import sqlite3
from contextlib import contextmanager
from sqlalchemy import event, create_engine
from sqlalchemy.engine import Engine
from sqlalchemy.orm import sessionmaker
import flatisfy.models.flat # noqa: F401
from flatisfy.database.base import BASE
@event.listens_for(Engine, "connect")
def set_sqlite_pragma(dbapi_connection, _):
"""
Auto enable foreign keys for SQLite.
"""
# Play well with other DB backends
if isinstance(dbapi_connection, sqlite3.Connection):
cursor = dbapi_connection.cursor()
cursor.execute("PRAGMA foreign_keys=ON")
cursor.close()
def init_db(database_uri=None):
"""
Initialize the database, ensuring tables exist etc.
    :param database_uri: A URI describing an engine to use. Defaults to an
        in-memory SQLite database.
    :return: A context manager providing a transactional database session.
"""
if database_uri is None:
database_uri = "sqlite:///:memory:"
engine = create_engine(database_uri)
BASE.metadata.create_all(engine, checkfirst=True)
Session = sessionmaker(bind=engine) # pylint: disable=invalid-name
@contextmanager
def get_session():
"""
Provide a transactional scope around a series of operations.
From [1].
[1]: http://docs.sqlalchemy.org/en/latest/orm/session_basics.html#when-do-i-construct-a-session-when-do-i-commit-it-and-when-do-i-close-it.
"""
session = Session()
try:
yield session
session.commit()
except:
session.rollback()
raise
finally:
session.close()
return get_session
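For reference, this is how the returned session maker is meant to be consumed
(mirroring `cmds.import_and_filter` above):
```
from flatisfy import database

get_session = database.init_db("sqlite:///:memory:")
with get_session() as session:
    # Work done here is committed if the block exits normally, and rolled
    # back if an exception escapes it.
    pass
```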

10
flatisfy/database/base.py Normal file

@@ -0,0 +1,10 @@
# coding: utf-8
"""
This module contains the definition of the declarative SQLAlchemy base.
"""
from __future__ import absolute_import, print_function, unicode_literals
from sqlalchemy.ext.declarative import declarative_base
BASE = declarative_base()

48
flatisfy/database/types.py Normal file

@@ -0,0 +1,48 @@
# coding: utf-8
"""
This module implements custom types in SQLAlchemy.
"""
from __future__ import absolute_import, print_function, unicode_literals
import json
import sqlalchemy.types as types
class StringyJSON(types.TypeDecorator):
"""
Stores and retrieves JSON as TEXT for SQLite.
From
https://avacariu.me/articles/2016/compiling-json-as-text-for-sqlite-with-sqlalchemy.
.. note :: The associated field is immutable. That is, changes to the data
(typically, changing the value of a dict field) will not trigger an update
on the SQL side upon ``commit`` as the reference to the object will not
have been updated. One should force the update by forcing an update of the
reference (by performing a ``copy`` operation on the dict for instance).
"""
impl = types.TEXT
def process_bind_param(self, value, dialect):
"""
        Serialize the value as a JSON string before storing it.
"""
if value is not None:
value = json.dumps(value)
return value
def process_result_value(self, value, dialect):
"""
        Deserialize a JSON string into a Python object when loading it.
"""
if value is not None:
value = json.loads(value)
return value
# TypeEngine.with_variant says "use StringyJSON instead when
# connecting to 'sqlite'"
# pylint: disable=invalid-name
MagicJSON = types.JSON().with_variant(StringyJSON, 'sqlite')
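The immutability caveat in the note above deserves a concrete example. A
self-contained sketch (the `Item` model is a throwaway, not part of Flatisfy)
showing that in-place mutation of the JSON field is invisible to the session,
while reassigning a copy is picked up:
```
from sqlalchemy import Column, Integer, create_engine
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker

from flatisfy.database.types import MagicJSON

Base = declarative_base()


class Item(Base):
    __tablename__ = "items"
    id = Column(Integer, primary_key=True)
    data = Column(MagicJSON)


engine = create_engine("sqlite:///:memory:")
Base.metadata.create_all(engine)
session = sessionmaker(bind=engine)()
session.add(Item(id=1, data={"seen": False}))
session.commit()

item = session.query(Item).get(1)
item.data["seen"] = True  # In-place mutation: NOT detected, never flushed.
session.commit()

data = dict(item.data)
data["seen"] = True
item.data = data          # Reassignment: detected and flushed on commit.
session.commit()
```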

13
flatisfy/exceptions.py Normal file

@@ -0,0 +1,13 @@
# coding: utf-8
"""
This module contains all the exceptions definitions for the Flatisfy-specific
exceptions.
"""
from __future__ import absolute_import, print_function, unicode_literals
class DataBuildError(Exception):
"""
    Error occurring while building a data file.
"""
pass

76
flatisfy/fetch.py Normal file

@@ -0,0 +1,76 @@
# coding: utf-8
"""
This module contains all the code related to fetching and loading flats lists.
"""
from __future__ import absolute_import, print_function, unicode_literals
import json
import logging
import subprocess
LOGGER = logging.getLogger(__name__)
def fetch_flats_list(config):
"""
Fetch the available flats using the Flatboob / Weboob config.
:param config: A config dict.
:return: A list of all available flats.
"""
flats_list = []
for query in config["queries"]:
max_entries = config["max_entries"]
if max_entries is None:
max_entries = 0
LOGGER.info("Loading flats from query %s.", query)
flatboob_output = subprocess.check_output(
["../weboob/tools/local_run.sh", "../weboob/scripts/flatboob",
"-n", str(max_entries), "-f", "json", "load", query]
)
query_flats_list = json.loads(flatboob_output)
LOGGER.info("Fetched %d flats.", len(query_flats_list))
flats_list.extend(query_flats_list)
LOGGER.info("Fetched a total of %d flats.", len(flats_list))
return flats_list
def fetch_details(flat_id):
"""
Fetch the additional details for a flat using Flatboob / Weboob.
:param flat_id: ID of the flat to fetch details for.
:return: A flat dict with all the available data.
"""
LOGGER.info("Loading additional details for flat %s.", flat_id)
flatboob_output = subprocess.check_output(
["../weboob/tools/local_run.sh", "../weboob/scripts/flatboob",
"-f", "json", "info", flat_id]
)
flat_details = json.loads(flatboob_output)
LOGGER.info("Fetched details for flat %s.", flat_id)
if flat_details:
flat_details = flat_details[0]
return flat_details
def load_flats_list(json_file):
"""
Load a dumped flats list from JSON file.
:param json_file: The file to load housings list from.
:return: A list of all the flats in the dump file.
"""
flats_list = []
try:
LOGGER.info("Loading flats list from file %s", json_file)
with open(json_file, "r") as fh:
flats_list = json.load(fh)
LOGGER.info("Found %d flats.", len(flats_list))
except (IOError, ValueError):
LOGGER.error("File %s is not a valid dump file.", json_file)
return flats_list

153
flatisfy/filters/__init__.py Normal file

@@ -0,0 +1,153 @@
# coding: utf-8
"""
This module contains all the filtering functions. It exposes ``first_pass`` and
``second_pass`` functions which are a set of filters applied during the first
pass and the second pass.
"""
from __future__ import absolute_import, print_function, unicode_literals
import logging
from flatisfy import tools
from flatisfy.filters import duplicates
from flatisfy.filters import metadata
LOGGER = logging.getLogger(__name__)
def refine_with_housing_criteria(flats_list, config):
"""
Filter a list of flats according to criteria.
    Housing posts websites tend to return broader results than what was
    actually asked for. Hence, we should filter the list to match the
    user's criteria, and avoid exposing unwanted flats.
:param flats_list: A list of flats dict to filter.
:param config: A config dict.
:return: A tuple of flats to keep and flats to delete.
"""
    # For each flat, the associated `is_ok` value indicates whether it should
    # be kept or discarded.
is_ok = [True for _ in flats_list]
for i, flat in enumerate(flats_list):
# Check postal code
postal_code = flat["flatisfy"].get("postal_code", None)
if (
postal_code and
postal_code not in config["constraints"]["postal_codes"]
):
LOGGER.info("Postal code for flat %s is out of range.", flat["id"])
is_ok[i] = is_ok[i] and False
# Check time_to
for place_name, time in flat["flatisfy"].get("time_to", {}).items():
is_within_interval = tools.is_within_interval(
time,
*(config["constraints"]["time_to"][place_name]["time"])
)
if not is_within_interval:
LOGGER.info("Flat %s is too far from place %s.",
flat["id"], place_name)
is_ok[i] = is_ok[i] and is_within_interval
# Check other fields
for field in ["area", "cost", "rooms", "bedrooms"]:
interval = config["constraints"][field]
is_within_interval = tools.is_within_interval(
flat.get(field, None),
*interval
)
if not is_within_interval:
LOGGER.info("%s for flat %s is out of range.",
field.capitalize(), flat["id"])
is_ok[i] = is_ok[i] and is_within_interval
return (
[
flat
for i, flat in enumerate(flats_list)
if is_ok[i]
],
[
flat
for i, flat in enumerate(flats_list)
if not is_ok[i]
]
)
def first_pass(flats_list, config):
"""
First filtering pass.
    Flatboob only fetches data from the listing pages of the available
    housings. Hence, we do a first pass of filtering based on the already
    available data, and only request more data for the remaining housings.
:param flats_list: A list of flats dict to filter.
:param config: A config dict.
:return: A tuple of processed flats and purged flats.
"""
LOGGER.info("Running first filtering pass.")
# Handle duplicates based on ids
# Just remove them (no merge) as they should be the exact same object.
flats_list = duplicates.detect(
flats_list, key="id", merge=False
)
# Also merge duplicates based on url (these may come from different
# flatboob backends)
    # This is especially useful as some websites, such as entreparticuliers,
    # contain a lot of leboncoin housing posts.
flats_list = duplicates.detect(
flats_list, key="url", merge=True
)
# Add the flatisfy metadata entry
flats_list = metadata.init(flats_list)
# Guess the postal codes
flats_list = metadata.guess_postal_code(flats_list, config)
# Try to match with stations
flats_list = metadata.guess_stations(flats_list, config)
# Remove returned housing posts that do not match criteria
flats_list, purged_list = refine_with_housing_criteria(flats_list, config)
return (flats_list, purged_list)
def second_pass(flats_list, config):
"""
Second filtering pass.
    This pass is expected to have as much information as possible on the
    available housings. Plus, it runs after the first pass, which already
    consolidated data.
    It should consolidate everything and try to extract as much data as
    possible from the fetched housings.
:param flats_list: A list of flats dict to filter.
:param config: A config dict.
:return: A tuple of processed flats and purged flats.
"""
LOGGER.info("Running second filtering pass.")
# Assumed to run after first pass, so there should be no obvious duplicates
# left and we already tried to find postal code and nearby stations.
# Confirm postal code
flats_list = metadata.guess_postal_code(flats_list, config)
# TODO: Guess the address
# Better match with stations (confirm and check better)
flats_list = metadata.guess_stations(flats_list, config)
# Compute travel time to specified points
flats_list = metadata.compute_travel_times(flats_list, config)
# Remove returned housing posts that do not match criteria
flats_list, purged_list = refine_with_housing_criteria(flats_list, config)
return (flats_list, purged_list)
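`tools.is_within_interval` is not included in this excerpt. Given how it is
called above, with a possibly-`None` value and `(min, max)` bounds where
`None` means unbounded, a plausible minimal sketch would be:
```
def is_within_interval(value, min_value=None, max_value=None):
    """
    Check that value lies within the given bounds. A ``None`` bound is not
    enforced; a ``None`` value is accepted, as there is nothing to check.
    Sketch only: the real flatisfy.tools helper may differ.
    """
    if value is None:
        return True
    if min_value is not None and value < min_value:
        return False
    if max_value is not None and value > max_value:
        return False
    return True
```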

56
flatisfy/filters/duplicates.py Normal file

@@ -0,0 +1,56 @@
# coding: utf-8
"""
Filtering functions to detect and merge duplicates.
"""
from __future__ import absolute_import, print_function, unicode_literals
import collections
from flatisfy import tools
def detect(flats_list, key="id", merge=True):
"""
Detect obvious duplicates within a given list of flats.
    Duplicates may be found, as some queries can overlap (especially since,
    when asked for a given place, websites tend to return housings in nearby
    locations as well). We need to handle them, by either deleting the
    duplicates (``merge=False``) or merging them together into a single flat
    object (``merge=True``).
:param flats_list: A list of flats dicts.
:param key: The flat dicts key on which the duplicate detection should be
done.
:param merge: Whether the found duplicates should be merged or we should
only keep one of them.
:return: A deduplicated list of flat dicts.
"""
# TODO: Keep track of found duplicates?
    # ``seen`` is a dict aggregating the flats by the deduplication key. We
    # basically make a bucket of flats for every key value. Flats in the
    # same bucket should be merged together afterwards.
seen = collections.defaultdict(list)
for flat in flats_list:
seen[flat.get(key, None)].append(flat)
# Generate the unique flats list based on these buckets
unique_flats_list = []
for flat_key, matching_flats in seen.items():
if flat_key is None:
# If the key is None, it means Weboob could not load the data. In
            # this case, we consider every matching item as being independent
# of the others, to avoid over-deduplication.
unique_flats_list.extend(matching_flats)
else:
# Otherwise, check the policy
if merge:
# If a merge is requested, do the merge
unique_flats_list.append(
tools.merge_dicts(*matching_flats)
)
else:
# Otherwise, just keep any of them
unique_flats_list.append(matching_flats[0])
return unique_flats_list
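A hypothetical usage example (the ids and URLs are made up): two posts
sharing a URL, e.g. the same housing fetched through two Weboob backends,
collapse into a single merged entry.
```
from flatisfy.filters.duplicates import detect

posts = [
    {"id": "123@seloger", "url": "http://example.com/1", "cost": 1200},
    {"id": "456@leboncoin", "url": "http://example.com/1", "area": 30},
    {"id": "789@pap", "url": "http://example.com/2", "cost": 900},
]
deduped = detect(posts, key="url", merge=True)
# Two flats remain; the first carries both "cost" and "area", since the
# duplicates were merged together through tools.merge_dicts.
assert len(deduped) == 2
```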

349
flatisfy/filters/metadata.py Normal file

@@ -0,0 +1,349 @@
# coding: utf-8
"""
Filtering functions to handle flatisfy-specific metadata.
This includes functions to guess metadata (postal codes, stations) from the
actual fetched data.
"""
from __future__ import absolute_import, print_function, unicode_literals
import logging
import re
from flatisfy import data
from flatisfy import tools
LOGGER = logging.getLogger(__name__)
def init(flats_list):
"""
Create a flatisfy key containing a dict of metadata fetched by flatisfy for
each flat in the list.
:param flats_list: A list of flats dict.
    :return: The updated list of flats dicts.
"""
for flat in flats_list:
if "flatisfy" not in flat:
flat["flatisfy"] = {}
return flats_list
def fuzzy_match(query, choices, limit=3, threshold=75):
"""
Custom search for the best element in choices matching the query.
:param query: The string to match.
:param choices: The list of strings to match with.
:param limit: The maximum number of items to return.
:param threshold: The score threshold to use.
:return: Tuples of matching items and associated confidence.
.. note :: This function works by removing any fancy character from the
``query`` and ``choices`` strings (replacing any non alphabetic and non
numeric characters by space), converting to lower case and normalizing them
    (collapsing multiple spaces, etc.). It also converts any roman numerals
    to the decimal system. It then looks for the longest string in
    ``choices`` which is a substring of ``query``. The longest one gets a
    confidence of 100. The shorter ones get a confidence proportional to
    their length.
.. seealso :: flatisfy.tools.normalize_string
.. todo :: Is there a better confidence measure?
:Example:
    >>> fuzzy_match("Paris 14ème", ["Ris", "ris", "Paris 14"], limit=1)
    [("Paris 14", 100)]
    >>> fuzzy_match( \
"Saint-Jacques, Denfert-Rochereau (Colonel Rol-Tanguy), " \
"Mouton-Duvernet", \
["saint-jacques", "denfert rochereau", "duvernet", "toto"], \
limit=4 \
)
[('denfert rochereau', 100), ('saint-jacques', 76)]
"""
normalized_query = tools.normalize_string(query)
normalized_choices = [tools.normalize_string(choice) for choice in choices]
# Remove duplicates in the choices list
unique_normalized_choices = tools.uniqify(normalized_choices)
# Get the matches (normalized strings)
# Keep only ``limit`` matches.
matches = sorted(
[
(choice, len(choice))
            for choice in unique_normalized_choices
if choice in normalized_query
],
key=lambda x: x[1],
reverse=True
)[:limit]
# Update confidence
if matches:
max_confidence = max(match[1] for match in matches)
matches = [
        # Multiply before dividing, so the result stays correct under
        # Python 2 integer division as well.
        (x[0], int(x[1] * 100 / max_confidence))
for x in matches
]
# Convert back matches to original strings
# Also filter out matches below threshold
matches = [
(choices[normalized_choices.index(x[0])], x[1])
for x in matches
if x[1] >= threshold
]
return matches
def guess_postal_code(flats_list, config, distance_threshold=20000):
"""
Try to guess the postal code from the location of the flats.
:param flats_list: A list of flats dict.
:param config: A config dict.
:param distance_threshold: Maximum distance in meters between the
constraint postal codes (from config) and the one found by this function,
to avoid bad fuzzy matching. Can be ``None`` to disable thresholding.
:return: An updated list of flats dict with guessed postal code.
"""
opendata = {
"cities": data.load_data("cities", config),
"postal_codes": data.load_data("postal_codes", config)
}
for flat in flats_list:
location = flat.get("location", None)
if not location:
# Skip everything if empty location
LOGGER.info(
(
"No location field for flat %s, skipping postal "
"code lookup."
),
flat["id"]
)
continue
postal_code = None
# Try to find a postal code directly
try:
postal_code = re.search(r"[0-9]{5}", location)
assert postal_code is not None
postal_code = postal_code.group(0)
# Check the postal code is within the db
assert postal_code in opendata["postal_codes"]
LOGGER.info(
"Found postal code in location field for flat %s: %s.",
flat["id"], postal_code
)
        except AssertionError:
postal_code = None
# If not found, try to find a city
if not postal_code:
matched_city = fuzzy_match(
location,
opendata["cities"].keys(),
limit=1
)
if matched_city:
# Store the matching postal code
matched_city = matched_city[0]
matched_city_name = matched_city[0]
postal_code = (
opendata["cities"][matched_city_name]["postal_code"]
)
LOGGER.info(
("Found postal code in location field through city lookup "
"for flat %s: %s."),
flat["id"], postal_code
)
# Check that postal code is not too far from the ones listed in config,
# limit bad fuzzy matching
if postal_code and distance_threshold:
distance = min(
tools.distance(
opendata["postal_codes"][postal_code]["gps"],
opendata["postal_codes"][constraint]["gps"],
)
for constraint in config["constraints"]["postal_codes"]
)
if distance > distance_threshold:
LOGGER.info(
("Postal code %s found for flat %s is off-constraints. "
"Min distance is %f."),
postal_code, flat["id"], distance
)
postal_code = None
# Store it
if postal_code:
existing_postal_code = flat["flatisfy"].get("postal_code", None)
if existing_postal_code and existing_postal_code != postal_code:
LOGGER.warning(
"Replacing previous postal code %s by %s for flat %s.",
existing_postal_code, postal_code, flat["id"]
)
flat["flatisfy"]["postal_code"] = postal_code
else:
LOGGER.info("No postal code found for flat %s.", flat["id"])
return flats_list
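# A minimal sketch of what this pass does. The flat dict below is a
# hypothetical illustration (not real data), and it assumes the opendata
# files are available and ``config`` defines a constraint postal code close
# to 75005:
#
#     flats = [{"id": "flat-1@seloger",
#               "location": "Rue Mouffetard, 75005 Paris",
#               "flatisfy": {}}]
#     flats = guess_postal_code(flats, config)
#     flats[0]["flatisfy"]["postal_code"]  # => '75005'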
def guess_stations(flats_list, config, distance_threshold=1500):
"""
Try to match the station field with a list of available stations nearby.
:param flats_list: A list of flats dict.
:param config: A config dict.
    :param distance_threshold: Maximum distance (in meters) between the
    center of the postal code area and the station, for the match to be
    considered valid.
:return: An updated list of flats dict with guessed nearby stations.
"""
opendata = {
"postal_codes": data.load_data("postal_codes", config),
"stations": data.load_data("ratp", config)
}
for flat in flats_list:
flat_station = flat.get("station", None)
# TODO: Use flat location field as well?
if not flat_station:
# Skip everything if empty station
LOGGER.info(
"No station field for flat %s, skipping stations lookup.",
flat["id"]
)
continue
matched_stations = fuzzy_match(
flat_station,
opendata["stations"].keys(),
limit=10,
threshold=50
)
# Filter out the stations that are obviously too far and not well
# guessed
good_matched_stations = []
postal_code = flat["flatisfy"].get("postal_code", None)
if postal_code:
            # If there is a postal code, check that the matched station is
            # close to it
postal_code_gps = opendata["postal_codes"][postal_code]["gps"]
            for station in matched_stations:
                # opendata["stations"] is a dict mapping station names to
                # lists of coordinates, for efficiency. Note that multiple
                # stations with the same name can exist in a city, hence the
                # list of coordinates.
                for station_gps in opendata["stations"][station[0]]:
                    distance = tools.distance(station_gps, postal_code_gps)
                    if distance < distance_threshold:
                        # If at least one of the coordinates for a given
                        # station is close enough, that's ok and we can add
                        # the station
                        good_matched_stations.append({
                            "name": station[0],
                            "confidence": station[1],
                            "gps": station_gps
                        })
                        break
                else:
                    # No coordinate was close enough to the postal code area
                    LOGGER.debug(
                        "Station %s is too far from flat %s, discarding it.",
                        station[0], flat["id"]
                    )
else:
LOGGER.info(
("No postal code for flat %s, keeping all the matched "
"stations with half confidence."),
flat["id"]
)
# Otherwise, we keep every matching station but with half
# confidence
good_matched_stations = [
{
"name": station[0],
"confidence": station[1] * 0.5,
"gps": station_gps
}
for station in matched_stations
for station_gps in opendata["stations"][station[0]]
]
# Store matched stations and the associated confidence
LOGGER.info(
"Found stations for flat %s: %s.",
flat["id"],
", ".join(x["name"] for x in good_matched_stations)
)
# TODO: Handle update (second pass)
flat["flatisfy"]["matched_stations"] = good_matched_stations
return flats_list
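# For reference, each entry of the ``matched_stations`` list stored above
# has the following shape (values are made up for illustration):
#
#     {
#         "name": "denfert rochereau",
#         "confidence": 100,
#         "gps": (48.8339, 2.3327)
#     }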
def compute_travel_times(flats_list, config):
"""
Compute the travel time between each flat and the points listed in the
constraints.
:param flats_list: A list of flats dict.
:param config: A config dict.
:return: An updated list of flats dict with computed travel times.
.. note :: Requires a Navitia or CityMapper API key in the config.
"""
for flat in flats_list:
if not flat["flatisfy"].get("matched_stations", []):
# Skip any flat without matched stations
LOGGER.info(
"Skipping travel time computation for flat %s. No matched "
"stations.",
flat["id"]
)
continue
if "time_to" not in flat["flatisfy"]:
# Ensure time_to key is initialized
flat["flatisfy"]["time_to"] = {}
# For each place, loop over the stations close to the flat, and find
# the minimum travel time.
for place_name, place in config["constraints"]["time_to"].items():
time_to_place = None
for station in flat["flatisfy"]["matched_stations"]:
time_from_station = tools.get_travel_time_between(
station["gps"],
place["gps"],
config
)
                if time_from_station and (time_to_place is None or
                                          time_from_station < time_to_place):
time_to_place = time_from_station
if time_to_place:
LOGGER.info(
"Travel time between %s and flat %s is %ds.",
place_name, flat["id"], time_to_place
)
flat["flatisfy"]["time_to"][place_name] = time_to_place
return flats_list
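# The passes above are meant to be chained on a fetched ``flats_list``;
# guess_stations relies on the postal code guessed by guess_postal_code, so
# order matters. A minimal sketch (assuming a loaded ``config`` dict):
#
#     flats_list = guess_postal_code(flats_list, config)
#     flats_list = guess_stations(flats_list, config)
#     flats_list = compute_travel_times(flats_list, config)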


101
flatisfy/models/flat.py Normal file
View File

@ -0,0 +1,101 @@
# coding: utf-8
"""
This modules defines an SQLAlchemy ORM model for a flat.
"""
# pylint: disable=invalid-name,too-few-public-methods
from __future__ import absolute_import, print_function, unicode_literals
import enum
from sqlalchemy import Column, DateTime, Enum, Float, String, Text
from flatisfy.database.base import BASE
from flatisfy.database.types import MagicJSON
class FlatStatus(enum.Enum):
"""
An enum of the possible status for a flat entry.
"""
purged = -10
new = 0
contacted = 10
answer_no = 20
answer_yes = 21
class Flat(BASE):
"""
SQLAlchemy ORM model to store a flat.
"""
__tablename__ = "flats"
# Weboob data
id = Column(String, primary_key=True)
area = Column(Float)
bedrooms = Column(Float)
cost = Column(Float)
currency = Column(String)
date = Column(DateTime)
details = Column(MagicJSON)
location = Column(String)
phone = Column(String)
photos = Column(MagicJSON)
rooms = Column(Float)
station = Column(String)
text = Column(Text)
title = Column(String)
url = Column(String)
# Flatisfy data
# TODO: Should be in another table with relationships
flatisfy_stations = Column(MagicJSON)
flatisfy_postal_code = Column(String)
flatisfy_time_to = Column(MagicJSON)
# Status
status = Column(Enum(FlatStatus), default=FlatStatus.new)
@staticmethod
def from_dict(flat_dict):
"""
Create a Flat object from a flat dict as manipulated by the filtering
pass.
"""
# Handle flatisfy metadata
flat_dict = flat_dict.copy()
flat_dict["flatisfy_stations"] = (
flat_dict["flatisfy"].get("matched_stations", None)
)
flat_dict["flatisfy_postal_code"] = (
flat_dict["flatisfy"].get("postal_code", None)
)
flat_dict["flatisfy_time_to"] = (
flat_dict["flatisfy"].get("time_to", None)
)
del flat_dict["flatisfy"]
# Handle date field
flat_dict["date"] = None # TODO
flat_object = Flat()
flat_object.__dict__.update(flat_dict)
return flat_object
def __repr__(self):
return "<Flat(id=%s, url=%s)>" % (self.id, self.url)
def json_api_repr(self):
"""
Return a dict representation of this flat object that is JSON
serializable.
"""
flat_repr = {
k: v
for k, v in self.__dict__.items()
if not k.startswith("_")
}
flat_repr["status"] = str(flat_repr["status"])
return flat_repr
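# A minimal sketch of the expected usage of this model (field values are
# hypothetical illustration data):
#
#     flat = Flat.from_dict({
#         "id": "flat-1@seloger",
#         "cost": 1500.0,
#         "currency": "EUR",
#         "flatisfy": {"postal_code": "75014"}
#     })
#     print(flat)  # <Flat(id=flat-1@seloger, url=None)>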

239
flatisfy/tools.py Normal file
View File

@ -0,0 +1,239 @@
# coding: utf-8
"""
This module contains basic utility functions, such as pretty printing of JSON
output, checking that a value is within a given interval etc.
"""
from __future__ import (
absolute_import, division, print_function, unicode_literals
)
import datetime
import json
import logging
import math
import re
import requests
import unidecode
LOGGER = logging.getLogger(__name__)
def pretty_json(data):
"""
Pretty JSON output.
:param data: The data to dump as pretty JSON.
:return: The pretty printed JSON dump.
:Example:
>>> print(pretty_json({"toto": "ok", "foo": "bar"}))
{
"foo": "bar",
"toto": "ok"
}
"""
return json.dumps(data, indent=4, separators=(',', ': '),
sort_keys=True)
def is_within_interval(value, min_value=None, max_value=None):
"""
Check whether a variable is within a given interval. Assumes the value is
always ok with respect to a `None` bound. If the `value` is `None`, it is
always within the bounds.
:param value: The value to check. Can be ``None``.
:param min_value: The lower bound.
:param max_value: The upper bound.
    :return: ``True`` if the value is ``None`` or within the given interval;
    ``False`` otherwise.
.. note:: A value is always within a ``None`` bound.
:Example:
>>> is_within_interval(None)
True
>>> is_within_interval(None, 0, 10)
True
>>> is_within_interval(2, None, None)
True
>>> is_within_interval(2, None, 3)
True
>>> is_within_interval(2, 1, None)
True
>>> is_within_interval(2, 1, 3)
True
>>> is_within_interval(2, 4, 7)
False
>>> is_within_interval(2, 4, 1)
False
"""
    checks = []
    if value is not None and min_value is not None:
        checks.append(value >= min_value)
    if value is not None and max_value is not None:
        checks.append(value <= max_value)
    return all(checks)
def normalize_string(string):
"""
Normalize the given string for matching.
    .. todo :: Convert roman numerals to decimal
:Example:
>>> normalize_string("tétéà 14ème-XIV, foobar")
'tetea 14eme xiv, foobar'
"""
# ASCIIfy the string
string = unidecode.unidecode(string)
# Replace any non-alphanumeric character by space
    # Keep some basic punctuation to preserve syntactic units
string = re.sub(r"[^a-zA-Z0-9,;:]", " ", string)
# Convert to lowercase
string = string.lower()
# Collapse multiple spaces, replace tabulations and newlines by space
string = re.sub(r"\s+", " ", string)
return string
def uniqify(some_list):
    """
    Filter out duplicates from a given list, keeping the original order.
    :Example:
    >>> uniqify([1, 2, 2, 3])
    [1, 2, 3]
    """
    seen = set()
    return [
        item for item in some_list
        if not (item in seen or seen.add(item))
    ]
def distance(gps1, gps2):
"""
Compute the distance between two tuples of latitude and longitude.
:param gps1: First tuple of (latitude, longitude).
:param gps2: Second tuple of (latitude, longitude).
:return: The distance in meters.
:Example:
>>> int(distance([48.86786647303717, 2.19368117495212], \
[48.95314107920405, 2.3368043817358464]))
14117
"""
lat1 = math.radians(gps1[0])
long1 = math.radians(gps1[1])
lat2 = math.radians(gps2[0])
long2 = math.radians(gps2[1])
# pylint: disable=invalid-name
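    # Haversine formula: ``a`` is the square of half the chord length
    # between the points, ``c`` is the angular distance in radians.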
a = (
math.sin((lat2 - lat1) / 2.0)**2 +
math.cos(lat1) * math.cos(lat2) * math.sin((long2 - long1) / 2.0)**2
)
c = 2.0 * math.atan2(math.sqrt(a), math.sqrt(1 - a))
earth_radius = 6371000
return earth_radius * c
def sort_list_of_dicts_by(flats_list, key):
"""
Sort a list of dicts according to a given field common to all the dicts.
:param flats_list: List of dicts to sort.
:param key: The key of the dict items to sort on.
:return: A sorted list.
:Example:
>>> sort_list_of_dicts_by([{1: 2}, {1: 1}], 1)
[{1: 1}, {1: 2}]
"""
return sorted(flats_list, key=lambda x: x[key])
def merge_dicts(*args):
    """
    Merge the flat dicts passed as arguments into a single flat dict object,
    the leftmost dict taking precedence when both values are set.
    """
    if len(args) == 1:
        return args[0]
    else:
        flat1, flat2 = args[:2]
        # Start from flat1 so that keys missing from flat2 are kept as well
        merged_flat = dict(flat1)
        for k, value2 in flat2.items():
            value1 = flat1.get(k, None)
            if value1 is None:
                # flat1 has an empty matching field, just keep the flat2 field
                merged_flat[k] = value2
            elif value2 is None:
                # flat2 field is empty, just keep the flat1 field
                merged_flat[k] = value1
            else:
                # Any other case, we should merge
                # TODO: Do the merge
                merged_flat[k] = value1
        return merge_dicts(merged_flat, *args[2:])
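# Sketch of the current merge behavior, where the leftmost non-``None``
# value wins (made-up values):
#
#     merge_dicts({"cost": 1500, "area": None}, {"cost": 1400, "area": 32.0})
#     # => {"cost": 1500, "area": 32.0}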
def get_travel_time_between(latlng_from, latlng_to, config):
"""
Query the Navitia API to get the travel time between two points identified
by their latitude and longitude.
    :param latlng_from: A tuple of (latitude, longitude) for the starting
    point.
    :param latlng_to: A tuple of (latitude, longitude) for the destination.
    :param config: A config dict.
    :return: The travel time in seconds. Returns ``None`` if it could not
    fetch it.
    .. note :: Uses the Navitia API. Requires a ``navitia_api_key`` field to
    be set in the ``config``.
NAVITIA_ENDPOINT = "https://api.navitia.io/v1/coverage/fr-idf/journeys"
time = None
# Check that Navitia API key is available
if config["navitia_api_key"]:
payload = {
"from": "%s;%s" % (latlng_from[1], latlng_from[0]),
"to": "%s;%s" % (latlng_to[1], latlng_to[0]),
"datetime": datetime.datetime.now().isoformat(),
"count": 1
}
try:
# Do the query to Navitia API
req = requests.get(
NAVITIA_ENDPOINT, params=payload,
auth=(config["navitia_api_key"], "")
)
req.raise_for_status()
time = req.json()["journeys"][0]["durations"]["total"]
except (requests.exceptions.RequestException,
ValueError, IndexError, KeyError) as e:
# Ignore any possible exception
LOGGER.warning(
"An exception occurred during travel time lookup on "
"Navitia: %s.",
str(e)
)
else:
LOGGER.warning(
"No API key available for travel time lookup. Please provide "
"a Navitia API key. Skipping travel time lookup."
)
return time
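# Usage sketch (coordinates and API key are placeholders). Note that the
# payload above swaps the (latitude, longitude) input tuples into the
# "longitude;latitude" order expected by Navitia:
#
#     config = {"navitia_api_key": "<your-navitia-key>"}
#     duration = get_travel_time_between(
#         (48.8339, 2.3327),  # Denfert-Rochereau
#         (48.8768, 2.3592),  # Gare du Nord
#         config
#     )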

0
flatisfy/web/__init__.py Normal file
View File

53
flatisfy/web/app.py Normal file
View File

@ -0,0 +1,53 @@
# coding: utf-8
"""
This module contains the definition of the Bottle web app.
"""
from __future__ import (
absolute_import, division, print_function, unicode_literals
)
import os
import bottle
from flatisfy import database
from flatisfy.web.routes import api as api_routes
from flatisfy.web.dbplugin import DatabasePlugin
def _serve_static_file(filename):
"""
Helper function to serve static file.
"""
return bottle.static_file(
filename,
root=os.path.join(
os.path.dirname(os.path.realpath(__file__)),
"static"
)
)
def get_app(config):
"""
    Get a Bottle app instance with all the routes set up.
:return: The built bottle app.
"""
get_session = database.init_db(config["database"])
app = bottle.default_app()
app.install(DatabasePlugin(get_session))
# API v1 routes
app.route("/api/v1/", "GET", api_routes.index_v1)
app.route("/api/v1/flats", "GET", api_routes.flats_v1)
app.route("/api/v1/flat/:id", "GET", api_routes.flat_v1)
# Index
app.route("/", "GET", lambda: _serve_static_file("index.html"))
# Static files
app.route("/static/<filename:path>", "GET", _serve_static_file)
return app
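# A minimal way to serve the app (host and port are arbitrary examples;
# ``config`` is assumed to be a loaded config dict with a ``database`` key):
#
#     app = get_app(config)
#     app.run(host="localhost", port=8080)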

58
flatisfy/web/dbplugin.py Normal file
View File

@ -0,0 +1,58 @@
# coding: utf-8
"""
This module contains a Bottle plugin to pass the database argument to any route
which needs it.
"""
from __future__ import (
absolute_import, division, print_function, unicode_literals
)
import functools
import inspect
import bottle
class DatabasePlugin(object):
name = 'database'
api = 2
KEYWORD = "db"
def __init__(self, get_session):
"""
:param keyword: Keyword used to inject session database in a route
:param create_session: SQLAlchemy session maker created with the
'sessionmaker' function. Will create its own if undefined.
"""
self.get_session = get_session
    def setup(self, app):
        """
        Make sure that no other installed plugin handles the same keyword
        argument.
        """
        for other in app.plugins:
            if isinstance(other, DatabasePlugin):
                raise bottle.PluginError(
                    "Found another conflicting Database plugin."
                )
    def apply(self, callback, route):
        try:
            callback_args = inspect.signature(route.callback).parameters
        except AttributeError:
            # inspect.signature does not exist on older Python versions
            callback_args = inspect.getargspec(route.callback).args

        if self.KEYWORD not in callback_args:
            return callback

        @functools.wraps(callback)
        def wrapper(*args, **kwargs):
            # Open a fresh session for each request and close it once the
            # callback has returned, instead of building a partial around a
            # session which is already closed by the time the route runs.
            with self.get_session() as session:
                kwargs[self.KEYWORD] = session
                return callback(*args, **kwargs)
        return wrapper
Plugin = DatabasePlugin
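# Usage sketch, mirroring what flatisfy.web.app.get_app does (the route
# callback below is a hypothetical example; relevant imports are assumed):
#
#     get_session = database.init_db(config["database"])
#     app = bottle.default_app()
#     app.install(DatabasePlugin(get_session))
#
#     def count_flats(db):  # ``db`` is injected because it matches KEYWORD
#         return {"count": db.query(flat_model.Flat).count()}
#     app.route("/api/v1/count", "GET", count_flats)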

0
flatisfy/web/routes/__init__.py Normal file
View File

47
flatisfy/web/routes/api.py Normal file
View File

@ -0,0 +1,47 @@
# coding: utf-8
"""
This module contains the definition of the web app API routes.
"""
from __future__ import (
absolute_import, division, print_function, unicode_literals
)
import bottle
from flatisfy.models import flat as flat_model
def index_v1():
"""
API v1 index route:
GET /api/v1/
"""
return {
"flats": "/api/v1/flats"
}
def flats_v1(db):
"""
API v1 flats route:
GET /api/v1/flats
"""
flats = [
flat.json_api_repr()
for flat in db.query(flat_model.Flat).all()
]
return {
"data": flats
}
def flat_v1(id, db):
    """
    API v1 flat route:
    GET /api/v1/flat/:id
    """
    # pylint: disable=redefined-builtin
    flat = db.query(flat_model.Flat).filter_by(id=id).first()
    if not flat:
        return bottle.HTTPError(404, "No flat with id {}.".format(id))
    return {
        "data": flat.json_api_repr()
    }
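# Example response shape for GET /api/v1/flat/:id (values are illustrative,
# other columns omitted):
#
#     {
#         "data": {
#             "id": "flat-1@seloger",
#             "cost": 1500.0,
#             "status": "FlatStatus.new"
#         }
#     }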

30
flatisfy/web/static/index.html Normal file
View File

@ -0,0 +1,30 @@
<!doctype html>
<html lang="fr">
<head>
<meta charset="utf-8">
<title>Flatisfy</title>
<script src="https://unpkg.com/vue"></script>
</head>
<body>
<div id="app">
<h1>Flatisfy</h1>
<table>
<thead>
<tr>
<th>Titre</th>
<th>Lien</th>
</tr>
</thead>
<tbody>
</tbody>
</table>
</div>
<script type="text/javascript">
var app = new Vue({
el: '#app',
data: {
}
})
</script>
</body>
</html>

3
hooks/pre-commit Executable file
View File

@ -0,0 +1,3 @@
#!/bin/sh
pylint --rcfile=.ci/pylintrc flatisfy

8
requirements.txt Normal file
View File

@ -0,0 +1,8 @@
appdirs
bottle
bottle-sqlalchemy
enum34
future
requests
sqlalchemy
unidecode