Initial commit

This commit is contained in:
Lucas Verney 2017-04-03 17:29:29 +02:00
parent f060324bae
commit d7012e3834
No known key found for this signature in database
GPG Key ID: 75B45CF41F334690
31 changed files with 2518 additions and 131 deletions

407
.ci/pylintrc Normal file
View File

@ -0,0 +1,407 @@
[MASTER]
# Specify a configuration file.
#rcfile=
# Python code to execute, usually for sys.path manipulation such as
# pygtk.require().
#init-hook=
# Add files or directories to the blacklist. They should be base names, not
# paths.
ignore=CVS
# Add files or directories matching the regex patterns to the blacklist. The
# regex matches against base names, not paths.
ignore-patterns=
# Pickle collected data for later comparisons.
persistent=yes
# List of plugins (as comma separated values of python modules names) to load,
# usually to register additional checkers.
load-plugins=
# Use multiple processes to speed up Pylint.
jobs=1
# Allow loading of arbitrary C extensions. Extensions are imported into the
# active Python interpreter and may run arbitrary code.
unsafe-load-any-extension=no
# A comma-separated list of package or module names from where C extensions may
# be loaded. Extensions are loaded into the active Python interpreter and may
# run arbitrary code.
extension-pkg-whitelist=
# Allow optimization of some AST trees. This will activate a peephole AST
# optimizer, which will apply various small optimizations. For instance, it can
# be used to obtain the result of joining multiple strings with the addition
# operator. Joining a lot of strings can lead to a maximum recursion error in
# Pylint and this flag can prevent that. It has one side effect, the resulting
# AST will be different than the one from reality. This option is deprecated
# and it will be removed in Pylint 2.0.
optimize-ast=no
[MESSAGES CONTROL]
# Only show warnings with the listed confidence levels. Leave empty to show
# all. Valid levels: HIGH, INFERENCE, INFERENCE_FAILURE, UNDEFINED
confidence=
# Enable the message, report, category or checker with the given id(s). You can
# either give multiple identifier separated by comma (,) or put this option
# multiple time (only on the command line, not in the configuration file where
# it should appear only once). See also the "--disable" option for examples.
#enable=
# Disable the message, report, category or checker with the given id(s). You
# can either give multiple identifiers separated by comma (,) or put this
# option multiple times (only on the command line, not in the configuration
# file where it should appear only once). You can also use "--disable=all" to
# disable everything first and then reenable specific checks. For example, if
# you want to run only the similarities checker, you can use "--disable=all
# --enable=similarities". If you want to run only the classes checker, but have
# no Warning level messages displayed, use "--disable=all --enable=classes
# --disable=W".
disable=import-star-module-level,old-octal-literal,oct-method,print-statement,unpacking-in-except,parameter-unpacking,backtick,old-raise-syntax,old-ne-operator,long-suffix,dict-view-method,dict-iter-method,metaclass-assignment,next-method-called,raising-string,indexing-exception,raw_input-builtin,long-builtin,file-builtin,execfile-builtin,coerce-builtin,cmp-builtin,buffer-builtin,basestring-builtin,apply-builtin,filter-builtin-not-iterating,using-cmp-argument,useless-suppression,range-builtin-not-iterating,suppressed-message,no-absolute-import,old-division,cmp-method,reload-builtin,zip-builtin-not-iterating,intern-builtin,unichr-builtin,reduce-builtin,standarderror-builtin,unicode-builtin,xrange-builtin,coerce-method,delslice-method,getslice-method,setslice-method,input-builtin,round-builtin,hex-method,nonzero-method,map-builtin-not-iterating
[REPORTS]
# Set the output format. Available formats are text, parseable, colorized, msvs
# (visual studio) and html. You can also give a reporter class, eg
# mypackage.mymodule.MyReporterClass.
output-format=text
# Put messages in a separate file for each module / package specified on the
# command line instead of printing them on stdout. Reports (if any) will be
# written in a file name "pylint_global.[txt|html]". This option is deprecated
# and it will be removed in Pylint 2.0.
files-output=no
# Tells whether to display a full report or only the messages
reports=yes
# Python expression which should return a note less than or equal to 10 (10 is
# the highest note). You have access to the variables error, warning, refactor,
# convention and statement, which respectively contain the number of messages
# of each of these categories and the total number of statements analyzed. This
# is used by the global evaluation report (RP0004).
evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10)
# Template used to display messages. This is a python new-style format string
# used to format the message information. See doc for all details
#msg-template=
[BASIC]
# Good variable names which should always be accepted, separated by a comma
good-names=i,j,k,ex,Run,_,fh
# Bad variable names which should always be refused, separated by a comma
bad-names=foo,bar,baz,toto,tutu,tata
# Colon-delimited sets of names that determine each other's naming style when
# the name regexes allow several styles.
name-group=
# Include a hint for the correct naming format with invalid-name
include-naming-hint=no
# List of decorators that produce properties, such as abc.abstractproperty. Add
# to this list to register other decorators that produce valid properties.
property-classes=abc.abstractproperty
# Regular expression matching correct function names
function-rgx=[a-z_][a-z0-9_]{2,30}$
# Naming hint for function names
function-name-hint=[a-z_][a-z0-9_]{2,30}$
# Regular expression matching correct variable names
variable-rgx=[a-z_][a-z0-9_]{2,30}$
# Naming hint for variable names
variable-name-hint=[a-z_][a-z0-9_]{2,30}$
# Regular expression matching correct constant names
const-rgx=(([A-Z_][A-Z0-9_]*)|(__.*__))$
# Naming hint for constant names
const-name-hint=(([A-Z_][A-Z0-9_]*)|(__.*__))$
# Regular expression matching correct attribute names
attr-rgx=[a-z_][a-z0-9_]{2,30}$
# Naming hint for attribute names
attr-name-hint=[a-z_][a-z0-9_]{2,30}$
# Regular expression matching correct argument names
argument-rgx=[a-z_][a-z0-9_]{2,30}$
# Naming hint for argument names
argument-name-hint=[a-z_][a-z0-9_]{2,30}$
# Regular expression matching correct class attribute names
class-attribute-rgx=([A-Za-z_][A-Za-z0-9_]{2,30}|(__.*__))$
# Naming hint for class attribute names
class-attribute-name-hint=([A-Za-z_][A-Za-z0-9_]{2,30}|(__.*__))$
# Regular expression matching correct inline iteration names
inlinevar-rgx=[A-Za-z_][A-Za-z0-9_]*$
# Naming hint for inline iteration names
inlinevar-name-hint=[A-Za-z_][A-Za-z0-9_]*$
# Regular expression matching correct class names
class-rgx=[A-Z_][a-zA-Z0-9]+$
# Naming hint for class names
class-name-hint=[A-Z_][a-zA-Z0-9]+$
# Regular expression matching correct module names
module-rgx=(([a-z_][a-z0-9_]*)|([A-Z][a-zA-Z0-9]+))$
# Naming hint for module names
module-name-hint=(([a-z_][a-z0-9_]*)|([A-Z][a-zA-Z0-9]+))$
# Regular expression matching correct method names
method-rgx=[a-z_][a-z0-9_]{2,30}$
# Naming hint for method names
method-name-hint=[a-z_][a-z0-9_]{2,30}$
# Regular expression which should only match function or class names that do
# not require a docstring.
no-docstring-rgx=^_
# Minimum line length for functions/classes that require docstrings, shorter
# ones are exempt.
docstring-min-length=-1
[ELIF]
# Maximum number of nested blocks for function / method body
max-nested-blocks=5
[FORMAT]
# Maximum number of characters on a single line.
max-line-length=100
# Regexp for a line that is allowed to be longer than the limit.
ignore-long-lines=^\s*(# )?<?https?://\S+>?$
# Allow the body of an if to be on the same line as the test if there is no
# else.
single-line-if-stmt=no
# List of optional constructs for which whitespace checking is disabled. `dict-
# separator` is used to allow tabulation in dicts, etc.: {1 : 1,\n222: 2}.
# `trailing-comma` allows a space between comma and closing bracket: (a, ).
# `empty-line` allows space-only lines.
no-space-check=trailing-comma,dict-separator
# Maximum number of lines in a module
max-module-lines=1000
# String used as indentation unit. This is usually " " (4 spaces) or "\t" (1
# tab).
indent-string=' '
# Number of spaces of indent required inside a hanging or continued line.
indent-after-paren=4
# Expected format of line ending, e.g. empty (any line ending), LF or CRLF.
expected-line-ending-format=
[LOGGING]
# Logging modules to check that the string format arguments are in logging
# function parameter format
logging-modules=logging
[MISCELLANEOUS]
# List of note tags to take in consideration, separated by a comma.
notes=FIXME,XXX,TODO
[SIMILARITIES]
# Minimum lines number of a similarity.
min-similarity-lines=4
# Ignore comments when computing similarities.
ignore-comments=yes
# Ignore docstrings when computing similarities.
ignore-docstrings=yes
# Ignore imports when computing similarities.
ignore-imports=no
[SPELLING]
# Spelling dictionary name. Available dictionaries: none. To make it work,
# install the python-enchant package.
spelling-dict=
# List of comma separated words that should not be checked.
spelling-ignore-words=
# A path to a file that contains private dictionary; one word per line.
spelling-private-dict-file=
# Tells whether to store unknown words to indicated private dictionary in
# --spelling-private-dict-file option instead of raising a message.
spelling-store-unknown-words=no
[TYPECHECK]
# Tells whether missing members accessed in mixin class should be ignored. A
# mixin class is detected if its name ends with "mixin" (case insensitive).
ignore-mixin-members=yes
# List of module names for which member attributes should not be checked
# (useful for modules/projects where namespaces are manipulated during runtime
# and thus existing member attributes cannot be deduced by static analysis. It
# supports qualified module names, as well as Unix pattern matching.
ignored-modules=
# List of class names for which member attributes should not be checked (useful
# for classes with dynamically set attributes). This supports the use of
# qualified names.
ignored-classes=optparse.Values,thread._local,_thread._local
# List of members which are set dynamically and missed by pylint inference
# system, and so shouldn't trigger E1101 when accessed. Python regular
# expressions are accepted.
generated-members=
# List of decorators that produce context managers, such as
# contextlib.contextmanager. Add to this list to register other decorators that
# produce valid context managers.
contextmanager-decorators=contextlib.contextmanager
[VARIABLES]
# Tells whether we should check for unused import in __init__ files.
init-import=no
# A regular expression matching the name of dummy variables (i.e. expectedly
# not used).
dummy-variables-rgx=(_+[a-zA-Z0-9]*?$)|dummy
# List of additional names supposed to be defined in builtins. Remember that
# you should avoid to define new builtins when possible.
additional-builtins=
# List of strings which can identify a callback function by name. A callback
# name must start or end with one of those strings.
callbacks=cb_,_cb
# List of qualified module names which can have objects that can redefine
# builtins.
redefining-builtins-modules=six.moves,future.builtins,builtins
[CLASSES]
# List of method names used to declare (i.e. assign) instance attributes.
defining-attr-methods=__init__,__new__,setUp
# List of valid names for the first argument in a class method.
valid-classmethod-first-arg=cls
# List of valid names for the first argument in a metaclass class method.
valid-metaclass-classmethod-first-arg=mcs
# List of member names, which should be excluded from the protected access
# warning.
exclude-protected=_asdict,_fields,_replace,_source,_make
[DESIGN]
# Maximum number of arguments for function / method
max-args=5
# Argument names that match this expression will be ignored. Default to name
# with leading underscore
ignored-argument-names=_.*
# Maximum number of locals for function / method body
max-locals=15
# Maximum number of return / yield for function / method body
max-returns=6
# Maximum number of branch for function / method body
max-branches=12
# Maximum number of statements in function / method body
max-statements=50
# Maximum number of parents for a class (see R0901).
max-parents=7
# Maximum number of attributes for a class (see R0902).
max-attributes=7
# Minimum number of public methods for a class (see R0903).
min-public-methods=2
# Maximum number of public methods for a class (see R0904).
max-public-methods=20
# Maximum number of boolean expressions in a if statement
max-bool-expr=5
[IMPORTS]
# Deprecated modules which should not be used, separated by a comma
deprecated-modules=regsub,TERMIOS,Bastion,rexec
# Create a graph of every (i.e. internal and external) dependencies in the
# given file (report RP0402 must not be disabled)
import-graph=
# Create a graph of external dependencies in the given file (report RP0402 must
# not be disabled)
ext-import-graph=
# Create a graph of internal dependencies in the given file (report RP0402 must
# not be disabled)
int-import-graph=
# Force import order to recognize a module as part of the standard
# compatibility libraries.
known-standard-library=
# Force import order to recognize a module as part of a third party library.
known-third-party=enchant
# Analyse import fallback blocks. This can be used to support both Python 2 and
# 3 compatible code, which means that the block might have code that exists
# only in one or another interpreter, leading to false positives when analysed.
analyse-fallback-blocks=no
[EXCEPTIONS]
# Exceptions that will emit a warning when being caught. Defaults to
# "Exception"
overgeneral-exceptions=Exception

5
.gitignore vendored
View File

@ -1,3 +1,6 @@
build
*.json
config.py
*.pyc
*.swp
*.swo
*.db

21
LICENSE.md Normal file
View File

@ -0,0 +1,21 @@
The MIT License (MIT)
Copyright (c) 2017 Phyks (Lucas Verney)
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

115
README.md Normal file
View File

@ -0,0 +1,115 @@
Flatisfy
========
Flatisfy is your new companion to ease your search for new housing :)
It uses [Weboob](http://weboob.org/) to get all the housing posts from most of
the websites offering housing posts, and then offers a bunch of pipelines to
filter and deduplicate the fetched housings.
It can be used as a command-line utility, but also exposes a web API and
visualisation, to browse through the results.
_Note_: It is targeted at French users (due to the currently supported
websites), and in particular at people living close to Paris, as I developed
it for my personal use, and am currently living in Paris :) Any feedback and
merge requests to better support other countries / cities are more than
welcome!
_Note_: In this repository and across the code, I am using the name "flat". I
use it as a placeholder for "housing" and consider the two interchangeable.
This code is not restricted to handling flats only!
## Getting started
1. Clone the repository.
2. Install required Python modules: `pip install -r requirements.txt`.
3. Init a configuration file: `python -m flatisfy init-config > config.json`.
Edit it according to your needs (see below).
4. Build the required data files:
`python -m flatisfy build-data --config config.json`.
5. Use it to `fetch` (and output a filtered JSON list of flats) or `import`
(into an SQLite database, for the web visualization) a list of flats
matching your criteria.
6. Use `python -m flatisfy serve --config config.json` to serve the web app.
## Configuration
List of configuration options:
* `data_directory` is the directory in which you want data files to be stored.
`null` is the default value and means default `XDG` location (typically
`~/.local/share/flatisfy/`)
* `max_entries` is the maximum number of entries to fetch **per Weboob
backend** (that is per housing website).
* `passes` is the number of passes to run on the data. The first pass is a
basic filtering using only the information from the housings list page.
The second pass loads all available information about the filtered flats and
does better filtering.
* `queries` is a list of queries defined in `flatboob` that should be fetched.
* `database` is an SQLAlchemy URI to a database file. Defaults to `null` which
means that it will store the database in the default location, in
`data_directory`.
* `navitia_api_key` is an API token for [Navitia](https://www.navitia.io/)
which is required to compute travel times.
### Constraints
You can specify constraints, under the `constraints` key. The available
constraints are:
* `area` (in m²), `bedrooms`, `cost` (in currency unit), `rooms`: this is a
tuple of `(min, max)` values, defining an interval in which the value should
lie. A `null` value means that any value is within this bound.
* `postal_codes` is a list of allowed postal codes. You should include any
postal code you want, and especially the postal codes close to the precise
location you want. You MUST provide some postal codes.
* `time_to` is a dictionary of places to which travel time should be computed.
Typically,
```
"time_to": {
"foobar": {
"gps": [LAT, LNG],
"time": [min, max]
}
}
```
means that the travel time between the housings and the place identified by
the GPS coordinates `LAT` and `LNG` (latitude and longitude) must be between
the `min` and `max` bounds (possibly `null`), and we call this place `foobar`
in human-readable form. Beware that `time` constraints are in **seconds**.
## OpenData
I am using the following datasets, available under `flatisfy/data_files`,
which cover Paris. If you want to run the script for some other location,
you might have to replace these files with matching datasets.
* [LaPoste Hexasmal](https://datanova.legroupe.laposte.fr/explore/dataset/laposte_hexasmal/?disjunctive.code_commune_insee&disjunctive.nom_de_la_commune&disjunctive.code_postal&disjunctive.libell_d_acheminement&disjunctive.ligne_5) for the list of cities and postal codes in France.
* [RATP stations](https://data.ratp.fr/explore/dataset/positions-geographiques-des-stations-du-reseau-ratp/table/?disjunctive.stop_name&disjunctive.code_postal&disjunctive.departement) for the list of subway stations with their positions in Paris and nearby areas.
Both datasets are licensed under the Open Data Commons Open Database License
(ODbL): https://opendatacommons.org/licenses/odbl/.
## License
The content of this repository is licensed under an MIT license, unless
explicitly mentioned otherwise.
## Thanks
* [Weboob](http://weboob.org/)
* The OpenData providers listed above!
* Navitia for their really cool public transportation API.
* A lot of Python modules, required for this script (see `requirements.txt`).
* [Kresus](https://framagit.org/bnjbvr/kresus) which gave me part of the
original idea (at least it proved to me that such software based on scraping
can achieve a high quality level :) ).

130
flat.py
View File

@ -1,130 +0,0 @@
# coding: utf-8
#!/usr/bin/env python3
import json
import os
import subprocess
import sys
from fuzzywuzzy import process as fuzzyprocess
import config
def pretty_json(json_str):
    """
    Serialize an object to a human-readable JSON string.

    The output is indented by four spaces, has its keys sorted and uses
    ``,`` / ``: `` as separators.

    :param json_str: The JSON-serializable object to dump.
    :return: The pretty-printed JSON string.
    """
    dump_options = {
        "indent": 4,
        "separators": (',', ': '),
        "sort_keys": True,
    }
    return json.dumps(json_str, **dump_options)
def preprocess_data():
    """
    Build the preprocessed data files under ``build/``.

    Creates the ``build`` directory when it is missing. Unless
    ``build/ratp.json`` already exists, it is generated from
    ``data/ratp.json`` as the sorted list of unique lowercased stop names.
    """
    if not os.path.isdir("build"):
        os.mkdir("build")

    # Nothing to do when the preprocessed file is already there
    if os.path.isfile("build/ratp.json"):
        return

    with open("data/ratp.json", "r") as source:
        raw_entries = json.load(source)

    stop_names = sorted(
        {entry["fields"]["stop_name"].lower() for entry in raw_entries}
    )

    with open("build/ratp.json", "w") as target:
        target.write(pretty_json(stop_names))
def fetch_flats_list():
    """
    Fetch the flats for every query listed in ``config.QUERIES``.

    Each query is loaded through the ``flatboob`` command-line tool, whose
    JSON output is parsed and appended to the result.

    :return: The list of all fetched flats, in query order.
    """
    fetched = []
    for query in config.QUERIES:
        command = ["flatboob", "-n", "0", "-f", "json", "load", query]
        raw_output = subprocess.check_output(command)
        fetched.extend(json.loads(raw_output))
    return fetched
def remove_duplicates(flats_list):
    """
    Remove flats sharing the same ``id``, keeping the first occurrence.

    :param flats_list: A list of flat dicts, each having an ``id`` key.
    :return: A new list with only the first flat seen for each ``id``, in
        the original order.
    """
    unique_flats_list = []
    # Record the ids of the flats themselves (the previous code stored the
    # ``id`` builtin, so no duplicate was ever detected). A set gives
    # constant-time membership tests; ids are assumed to be hashable JSON
    # scalars.
    seen_ids = set()
    for flat in flats_list:
        flat_id = flat["id"]
        if flat_id in seen_ids:
            continue
        seen_ids.add(flat_id)
        unique_flats_list.append(flat)
    return unique_flats_list
def sort_by(flats_list, key="cost"):
    """
    Return the flats sorted in ascending order of the given field.

    :param flats_list: A list of flat dicts.
    :param key: The dict key to sort on. Defaults to ``"cost"``.
    :return: A new sorted list.
    """
    # Honor the ``key`` argument (it used to be ignored in favor of "cost")
    return sorted(flats_list, key=lambda flat: flat[key])
def refine_params(flats_list):
    """
    Keep only the flats whose cost and area lie within the configured bounds.

    Bounds are read from ``config.PARAMS`` (``min_cost`` / ``max_cost`` and
    ``min_area`` / ``max_area``) and are exclusive. A flat missing one of the
    fields is not filtered on that field.

    :param flats_list: An iterable of flat dicts.
    :return: The list of flats matching the constraints. A real list is
        returned (rather than a lazy ``filter`` object) so that the result
        can be serialized to JSON and iterated more than once.
    """
    def _within_bounds(flat, field):
        """Check ``field`` of ``flat`` against its configured bounds."""
        if field not in flat:
            # No value available: do not filter on this field
            return True
        value = flat[field]
        return (
            value < config.PARAMS["max_" + field] and
            value > config.PARAMS["min_" + field]
        )

    return [
        flat for flat in flats_list
        if _within_bounds(flat, "cost") and _within_bounds(flat, "area")
    ]
def match_ratp(flats_list):
    """
    Annotate flats with the closest-matching RATP station name.

    Station names are loaded from ``build/ratp.json``. For every flat with a
    non-empty ``station`` field, the result of the fuzzy match against these
    names is stored under the ``ratp_station`` key.

    :param flats_list: A list of flat dicts, updated in place.
    :return: The same ``flats_list``.
    """
    with open("build/ratp.json", "r") as stations_file:
        known_stations = json.load(stations_file)

    for flat in flats_list:
        if not flat.get("station"):
            # No station fetched by flatboob for this flat, nothing to match
            continue
        flat["ratp_station"] = fuzzyprocess.extractOne(
            flat["station"], known_stations
        )
        # TODO: Cross-check station location to choose the best fit
    return flats_list
def main(dumpfile=None):
    """
    Get the flats list and run the first filtering pass on it.

    :param dumpfile: Optional path to a JSON dump of flats to load instead of
        fetching them.
    :return: The deduplicated, cost-sorted and filtered flats.
    """
    if dumpfile is not None:
        with open(dumpfile, "r") as dump:
            candidates = json.load(dump)
    else:
        candidates = fetch_flats_list()

    # First pass
    deduplicated = remove_duplicates(candidates)
    ordered = sort_by(deduplicated, "cost")
    filtered = refine_params(ordered)
    # TODO: flats_list = match_ratp(flats_list)
    # TODO: Second pass, loading additional infos for each entry
    return filtered
if __name__ == "__main__":
    # Optional first command-line argument: path to a JSON dump of flats
    dumpfile = sys.argv[1] if len(sys.argv) > 1 else None
    try:
        preprocess_data()
        flats_list = main(dumpfile)
        print(pretty_json(flats_list))
    except KeyboardInterrupt:
        # Exit quietly when interrupted by the user
        pass

5
flatisfy/__init__.py Normal file
View File

@ -0,0 +1,5 @@
# coding: utf-8
"""
``Flatisfy`` is a tool to help you find new housing based on a set of
criteria.
"""
# Version string of the package.
__version__ = "0.1"

176
flatisfy/__main__.py Normal file
View File

@ -0,0 +1,176 @@
# coding: utf-8
"""
Main entry point of the Flatisfy code.
"""
from __future__ import absolute_import, print_function, unicode_literals
import argparse
import logging
import sys
import flatisfy.config
from flatisfy import cmds
from flatisfy import data
from flatisfy import tools
# Logger named "flatisfy", used by ``main`` to report configuration errors.
LOGGER = logging.getLogger("flatisfy")
def parse_args(argv=None):
    """
    Build the command-line parser and parse the arguments.

    :param argv: Optional list of arguments to parse. ``None`` lets
        ``argparse`` read them from ``sys.argv``.
    :return: The parsed arguments namespace. The chosen subcommand is stored
        in its ``cmd`` attribute.
    """
    # Options shared by every subcommand, declared once on a parent parser
    common = argparse.ArgumentParser(add_help=False)
    shared_options = (
        (("--data-dir",),
         {"help": "Location of Flatisfy data directory."}),
        (("--config",),
         {"help": "Configuration file to use."}),
        (("--passes",),
         {"choices": [0, 1, 2], "type": int,
          "help": "Number of passes to do on the filtered data."}),
        (("--max-entries",),
         {"type": int, "help": "Maximum number of entries to fetch."}),
        (("-v", "--verbose"),
         {"action": "store_true", "help": "Verbose logging output."}),
        (("-vv",),
         {"action": "store_true", "help": "Debug logging output."}),
    )
    for flags, options in shared_options:
        common.add_argument(*flags, **options)

    root = argparse.ArgumentParser(prog="Flatisfy",
                                   description="Find the perfect flat.")
    commands = root.add_subparsers(dest="cmd", help="Available subcommands")

    def _add_command(name, help_text):
        """Register a subcommand inheriting the shared options."""
        return commands.add_parser(name, parents=[common], help=help_text)

    # Build data subcommand
    _add_command("build-data", "Build necessary data")

    # Init config subcommand
    init_config_cmd = _add_command("init-config",
                                   "Initialize empty configuration.")
    init_config_cmd.add_argument(
        "output", nargs="?", help="Output config file. Use '-' for stdout."
    )

    # Fetch subcommand
    _add_command("fetch", "Fetch housings posts")

    # Filter subcommand
    filter_cmd = _add_command(
        "filter",
        "Filter housings posts. No fetching of additional infos is done."
    )
    filter_cmd.add_argument(
        "input", help="JSON dump of the housings post to filter."
    )

    # Import subcommand
    _add_command("import", "Import housing posts in database.")

    # Serve subcommand
    serve_cmd = _add_command("serve", "Serve the web app.")
    serve_cmd.add_argument("--port", type=int, help="Port to bind to.")
    serve_cmd.add_argument("--host", help="Host to listen on.")

    return root.parse_args(argv)
def main():
    """
    Main module code.

    Parse the command-line arguments, set the logging level, then either
    write an initial configuration (``init-config``) or load the
    configuration, build the data files and run the requested subcommand.

    Exits the process with status ``1`` on an invalid configuration or a data
    build error, and with status ``0`` after ``init-config`` and
    ``build-data``.
    """
    # Parse arguments
    args = parse_args()

    # Set logger
    if args.vv:
        logging.basicConfig(level=logging.DEBUG)
        logging.getLogger('sqlalchemy.engine').setLevel(logging.DEBUG)
    elif args.verbose:
        logging.basicConfig(level=logging.INFO)
        # sqlalchemy INFO level is way too loud, just stick with WARNING
        logging.getLogger('sqlalchemy.engine').setLevel(logging.WARNING)
    else:
        logging.basicConfig(level=logging.WARNING)
        logging.getLogger('sqlalchemy.engine').setLevel(logging.WARNING)

    # Init-config command
    if args.cmd == "init-config":
        flatisfy.config.init_config(args.output)
        sys.exit(0)
    else:
        # Load config
        config = flatisfy.config.load_config(args)
        if config is None:
            LOGGER.error("Invalid configuration. Exiting. "
                         "Run init-config before if this is the first time "
                         "you run Flatisfy.")
            sys.exit(1)

    # Build data files
    try:
        if args.cmd == "build-data":
            # Explicit request: rebuild even if data files already exist
            data.preprocess_data(config, force=True)
            sys.exit(0)
        else:
            data.preprocess_data(config)
    # NOTE(review): only ``flatisfy.config`` is imported explicitly by this
    # module; this lookup presumably relies on another module having imported
    # ``flatisfy.exceptions`` -- confirm, or import it explicitly.
    except flatisfy.exceptions.DataBuildError:
        sys.exit(1)

    # Fetch command
    if args.cmd == "fetch":
        # Fetch and filter flats list
        flats_list, _ = cmds.fetch_and_filter(config)
        # Sort by cost
        flats_list = tools.sort_list_of_dicts_by(flats_list, "cost")

        print(
            tools.pretty_json(flats_list)
        )
    # Filter command
    elif args.cmd == "filter":
        # Load and filter flats list. ``load_and_filter`` returns a tuple of
        # (matching flats, ignored flats); only the matching ones are printed.
        flats_list, _ = cmds.load_and_filter(args.input, config)
        # Sort by cost
        flats_list = tools.sort_list_of_dicts_by(flats_list, "cost")

        print(
            tools.pretty_json(flats_list)
        )
    # Import command
    elif args.cmd == "import":
        cmds.import_and_filter(config)
    # Serve command
    elif args.cmd == "serve":
        cmds.serve(config)
if __name__ == "__main__":
    try:
        main()
    except KeyboardInterrupt:
        # Exit quietly on Ctrl-C instead of printing a traceback
        pass

110
flatisfy/cmds.py Normal file
View File

@ -0,0 +1,110 @@
# coding: utf-8
"""
Main commands available for flatisfy.
"""
from __future__ import absolute_import, print_function, unicode_literals
import flatisfy.filters
from flatisfy import database
from flatisfy.models import flat as flat_model
from flatisfy import fetch
from flatisfy import tools
from flatisfy.web import app as web_app
def fetch_and_filter(config):
    """
    Fetch the available flats list. Then, filter it according to criteria.

    :param config: A config dict. Its ``passes`` entry selects how many
        filtering passes are run (none, first only, or first and second).
    :return: A tuple of the list of all matching flats and the list of ignored
        flats. The list of ignored flats is empty when no pass is run.
    """
    # TODO: Reduce load on housings listing websites
    # Fetch flats list with flatboobs
    flats_list = fetch.fetch_flats_list(config)

    # Always defined, so that the return statement below is valid even when
    # ``passes`` is 0 and no filtering pass is run.
    ignored_flats = []

    # Do a first pass with the available infos to try to remove as much
    # unwanted postings as possible
    if config["passes"] > 0:
        flats_list, ignored_flats = flatisfy.filters.first_pass(flats_list,
                                                                config)

    # Do a second pass to consolidate all the infos we found and make use of
    # additional infos
    if config["passes"] > 1:
        # Load additional infos. The merged dicts are stored back in the list:
        # rebinding the loop variable alone would discard them.
        # NOTE(review): assumes ``tools.merge_dicts`` returns the merged dict,
        # as the previous ``flat = tools.merge_dicts(...)`` implied -- confirm.
        flats_list = [
            tools.merge_dicts(flat, fetch.fetch_details(flat["id"]))
            for flat in flats_list
        ]

        flats_list, extra_ignored_flats = flatisfy.filters.second_pass(
            flats_list, config
        )
        ignored_flats.extend(extra_ignored_flats)

    return flats_list, ignored_flats
def load_and_filter(housing_file, config):
    """
    Load the dumped flats list. Then, filter it according to criteria.

    :param housing_file: The JSON file to load flats from.
    :param config: A config dict. Its ``passes`` entry selects how many
        filtering passes are run (none, first only, or first and second).
    :return: A tuple of the list of all matching flats and the list of ignored
        flats. The list of ignored flats is empty when no pass is run.
    """
    # Load flats list
    flats_list = fetch.load_flats_list(housing_file)

    # Always defined, so that the return statement below is valid even when
    # ``passes`` is 0 and no filtering pass is run.
    ignored_flats = []

    # Do a first pass with the available infos to try to remove as much
    # unwanted postings as possible
    if config["passes"] > 0:
        flats_list, ignored_flats = flatisfy.filters.first_pass(flats_list,
                                                                config)

    # Do a second pass to consolidate all the infos we found
    if config["passes"] > 1:
        flats_list, extra_ignored_flats = flatisfy.filters.second_pass(
            flats_list, config
        )
        ignored_flats.extend(extra_ignored_flats)

    return flats_list, ignored_flats
def import_and_filter(config):
    """
    Fetch and filter the flats, then merge the result into the database.

    Matching flats are merged as they are; ignored flats are merged with
    their status set to ``FlatStatus.purged``.

    :param config: A config dict.
    :return: ``None``.
    """
    # Fetch and filter flats list
    kept_flats, purged_flats = fetch_and_filter(config)

    # Create database connection
    session_factory = database.init_db(config["database"])

    with session_factory() as session:
        for entry in kept_flats:
            session.merge(flat_model.Flat.from_dict(entry))

        for entry in purged_flats:
            purged = flat_model.Flat.from_dict(entry)
            purged.status = flat_model.FlatStatus.purged
            session.merge(purged)
def serve(config):
    """
    Run the web app on the configured host and port.

    :param config: A config dict, providing the ``host`` and ``port`` entries.
    :return: ``None``, long-running process.
    """
    application = web_app.get_app(config)
    listen_on = {"host": config["host"], "port": config["port"]}
    # TODO: Make Bottle use logging module
    application.run(**listen_on)

208
flatisfy/config.py Normal file
View File

@ -0,0 +1,208 @@
# coding: utf-8
"""
This module handles the configuration management for Flatisfy.
It loads the default configuration, then overloads it with the provided config
file and then overloads it with command-line options.
"""
from __future__ import absolute_import, print_function, unicode_literals
from builtins import str
import json
import logging
import os
import sys
import traceback
import appdirs
from flatisfy import tools
# Default configuration
# Baseline value of every option. As stated in the module docstring, these
# values are overloaded by the config file, then by command-line options.
DEFAULT_CONFIG = {
    # Flatboob queries to fetch
    "queries": [],
    # Constraints to match
    "constraints": {
        "postal_codes": [],  # List of postal codes
        "area": (None, None),  # (min, max) in m^2
        "cost": (None, None),  # (min, max) in currency unit
        "rooms": (None, None),  # (min, max)
        "bedrooms": (None, None),  # (min, max)
        "time_to": {}  # Dict mapping names to {"gps": [lat, lng],
                       #                        "time": (min, max) }
                       # Time is in seconds
    },
    # Navitia API key
    "navitia_api_key": None,
    # Number of filtering passes to run
    "passes": 2,
    # Maximum number of entries to fetch
    "max_entries": None,
    # Directory in which data will be put. ``None`` is XDG default location.
    "data_directory": None,
    # SQLAlchemy URI to the database to use
    "database": None,
    # Web app port
    "port": 8080,
    # Web app host to listen on
    "host": "127.0.0.1"
}

# Module-level logger, named after this module.
LOGGER = logging.getLogger(__name__)
def validate_config(config):
    """
    Check that the config passed as argument is a valid configuration.

    Validation is written as a sequence of ``assert`` statements. On the
    first failure, the source text of the failing line is extracted from the
    traceback and returned, so that the caller can report which check failed.

    :param config: A config dictionary to check.
    :return: ``True`` if the configuration is valid. Otherwise, the source
        line of the check that failed, taken from the last traceback entry
        (a line of ``_check_constraints_bounds`` when a bounds check failed).
        Callers must therefore compare the result with ``is True`` rather
        than test its truthiness.
    """
    def _check_constraints_bounds(bounds):
        """
        Check the bounds for numeric constraints.

        ``bounds`` must hold exactly two items, each being either ``None`` or
        a non-negative ``int`` / ``float``. When both are set, the second one
        must be strictly greater than the first one.

        :raises AssertionError: If one of these checks fails.
        """
        assert len(bounds) == 2
        assert all(
            x is None or
            (
                (isinstance(x, int) or isinstance(x, float)) and
                x >= 0
            )
            for x in bounds
        )
        if bounds[0] is not None and bounds[1] is not None:
            assert bounds[1] > bounds[0]

    try:
        # Note: The traceback fetching code only handle single line asserts.
        # Then, we disable line-too-long pylint check and E501 flake8 checks
        # and use long lines whenever needed, in order to have the full assert
        # message in the log output.
        # NOTE(review): ``assert`` statements are stripped when Python runs
        # with ``-O``, in which case these checks are skipped.
        # pylint: disable=line-too-long
        assert "postal_codes" in config["constraints"]
        assert len(config["constraints"]["postal_codes"]) > 0

        assert "area" in config["constraints"]
        _check_constraints_bounds(config["constraints"]["area"])

        assert "cost" in config["constraints"]
        _check_constraints_bounds(config["constraints"]["cost"])

        assert "rooms" in config["constraints"]
        _check_constraints_bounds(config["constraints"]["rooms"])

        assert "bedrooms" in config["constraints"]
        _check_constraints_bounds(config["constraints"]["bedrooms"])

        assert "time_to" in config["constraints"]
        assert isinstance(config["constraints"]["time_to"], dict)
        for name, item in config["constraints"]["time_to"].items():
            assert isinstance(name, str)
            assert "gps" in item
            assert isinstance(item["gps"], list)
            assert len(item["gps"]) == 2
            assert "time" in item
            _check_constraints_bounds(item["time"])

        assert config["passes"] in [0, 1, 2]
        assert config["max_entries"] is None or (isinstance(config["max_entries"], int) and config["max_entries"] > 0)  # noqa: E501

        assert config["data_directory"] is None or isinstance(config["data_directory"], str)  # noqa: E501

        assert config["database"] is None or isinstance(config["database"], str)  # noqa: E501

        assert isinstance(config["port"], int)
        assert isinstance(config["host"], str)

        return True
    except (AssertionError, KeyError):
        # A missing key (``KeyError``) is reported the same way as a failed
        # assertion: return the text of the line that raised.
        _, _, exc_traceback = sys.exc_info()
        return traceback.extract_tb(exc_traceback)[-1][-1]
def load_config(args=None):
    """
    Build the configuration to use.

    The default configuration is overloaded with the JSON file given by
    ``args.config`` (if any), then with the command-line options. The data
    directory and the database URI are then filled in when still unset, and
    the result is validated.

    :param args: An argparse args structure.
    :return: The loaded config dict, ``None`` if it is not a valid
        configuration.
    """
    def _cli_option(name):
        """
        Get the value of a command-line option, ``None`` if there is none.
        """
        return getattr(args, name, None) if args else None

    LOGGER.info("Initializing configuration...")

    # Default configuration
    settings = DEFAULT_CONFIG.copy()

    # Load config from specified JSON
    config_path = _cli_option("config")
    if config_path:
        LOGGER.debug("Loading configuration from %s.", config_path)
        try:
            with open(config_path, "r") as config_file:
                settings.update(json.load(config_file))
        except (IOError, ValueError):
            LOGGER.error(
                "Unable to load configuration from file, "
                "using default configuration."
            )

    # Overload config with arguments
    cli_passes = _cli_option("passes")
    if cli_passes is not None:
        LOGGER.debug(
            "Overloading number of passes from CLI arguments: %d.",
            cli_passes
        )
        settings["passes"] = cli_passes

    cli_max_entries = _cli_option("max_entries")
    if cli_max_entries is not None:
        LOGGER.debug(
            "Overloading maximum number of entries from CLI arguments: %d.",
            cli_max_entries
        )
        settings["max_entries"] = cli_max_entries

    cli_port = _cli_option("port")
    if cli_port is not None:
        LOGGER.debug("Overloading web app port: %d.", cli_port)
        settings["port"] = cli_port

    cli_host = _cli_option("host")
    if cli_host is not None:
        LOGGER.debug("Overloading web app host: %s.", cli_host)
        settings["host"] = str(cli_host)

    # Handle data_directory option
    cli_data_dir = _cli_option("data_dir")
    if cli_data_dir is not None:
        LOGGER.debug("Overloading data directory from CLI arguments.")
        settings["data_directory"] = cli_data_dir
    elif settings["data_directory"] is None:
        settings["data_directory"] = appdirs.user_data_dir(
            "flatisfy",
            "flatisfy"
        )
        LOGGER.debug("Using default XDG data directory: %s.",
                     settings["data_directory"])

    # Without an explicit database, use a SQLite file in the data directory
    if settings["database"] is None:
        database_file = os.path.join(
            settings["data_directory"],
            "flatisfy.db"
        )
        settings["database"] = "sqlite:///" + database_file

    validation = validate_config(settings)
    if validation is not True:
        LOGGER.error("Error in configuration: %s.", validation)
        return None

    LOGGER.info("Config has been fully initialized.")
    return settings
def init_config(output=None):
    """
    Dump the default configuration, as a template for a new config file.

    :param output: File to output content to. Defaults to the standard
        output, which is also used when ``output`` is ``"-"``.
    """
    defaults = DEFAULT_CONFIG.copy()

    write_to_stdout = not output or output == "-"
    if write_to_stdout:
        print(tools.pretty_json(defaults))
        return

    with open(output, "w") as output_file:
        output_file.write(tools.pretty_json(defaults))

163
flatisfy/data.py Normal file
View File

@ -0,0 +1,163 @@
# coding: utf-8
"""
This module contains all the code related to building necessary data files from
the source opendata files.
"""
from __future__ import absolute_import, print_function, unicode_literals
import collections
import json
import logging
import os
import flatisfy.exceptions
# Module-level logger.
LOGGER = logging.getLogger(__name__)
# Directory containing this module (symlinks resolved). The raw opendata files
# are read from its ``data_files`` subdirectory.
MODULE_DIR = os.path.dirname(os.path.realpath(__file__))
def _preprocess_ratp(output_dir):
    """
    Build the ``ratp.json`` file, mapping each lowercased stop name to the
    list of coordinates found for it in the raw RATP opendata file.

    :param output_dir: Directory in which the output file should reside.
    :return: ``True`` on successful build, ``False`` if the raw file could not
        be read or parsed.
    """
    # Load opendata file
    raw_path = os.path.join(MODULE_DIR, "data_files/ratp.json")
    try:
        with open(raw_path, "r") as raw_file:
            raw_stops = json.load(raw_file)
    except (IOError, ValueError):
        LOGGER.error("Invalid raw RATP opendata file.")
        return False

    # Group the coordinates by stop name
    coords_by_stop = collections.defaultdict(list)
    for entry in raw_stops:
        name_key = entry["fields"]["stop_name"].lower()
        coords_by_stop[name_key].append(entry["fields"]["coord"])

    # Output it
    built_path = os.path.join(output_dir, "ratp.json")
    with open(built_path, "w") as built_file:
        json.dump(coords_by_stop, built_file)

    return True
def _preprocess_laposte(output_dir):
    """
    Build JSON files from the postal codes data.

    Two files are written in ``output_dir``: ``postal_codes.json`` (postal
    code to GPS coordinates and city name) and ``cities.json`` (city name to
    GPS coordinates and postal code). Items lacking one of the required fields
    are logged and skipped.

    :param output_dir: Directory in which the output file should reside.
    :return: ``True`` on successful build, ``False`` if the raw file could not
        be read or parsed.
    """
    # Load opendata file
    try:
        with open(
            os.path.join(MODULE_DIR, "data_files/laposte.json"), "r"
        ) as fh:
            raw_laposte_data = json.load(fh)
    except (IOError, ValueError):
        LOGGER.error("Invalid raw LaPoste opendata file.")
        return False

    # Build postal codes to other infos file
    postal_codes_data = {}
    for item in raw_laposte_data:
        fields = item.get("fields", {})
        try:
            postal_codes_data[fields["code_postal"]] = {
                "gps": fields["coordonnees_gps"],
                "nom": fields["nom_de_la_commune"].title()
            }
        except KeyError:
            # The postal code may be the missing field itself: do not index it
            # again here, this would raise a new (uncaught) ``KeyError``.
            LOGGER.info("Missing data for postal code %s, skipping it.",
                        fields.get("code_postal"))
    with open(os.path.join(output_dir, "postal_codes.json"), "w") as fh:
        json.dump(postal_codes_data, fh)

    # Build city name to postal codes and other infos file
    cities_data = {}
    for item in raw_laposte_data:
        fields = item.get("fields", {})
        try:
            cities_data[fields["nom_de_la_commune"].title()] = {
                "gps": fields["coordonnees_gps"],
                "postal_code": fields["code_postal"]
            }
        except KeyError:
            # Same as above, the city name may be the missing field.
            LOGGER.info("Missing data for city %s, skipping it.",
                        fields.get("nom_de_la_commune"))
    with open(os.path.join(output_dir, "cities.json"), "w") as fh:
        json.dump(cities_data, fh)

    return True
def preprocess_data(config, force=False):
    """
    Ensures that all the necessary data files have been built from the raw
    opendata files.

    :param config: A config dictionary.
    :param force: Whether to force rebuild or not.
    :raises OSError: If the opendata directory does not exist and could not be
        created.
    :raises flatisfy.exceptions.DataBuildError: If one of the data files could
        not be built.
    """
    LOGGER.debug("Data directory is %s.", config["data_directory"])
    opendata_directory = os.path.join(config["data_directory"], "opendata")
    try:
        LOGGER.info("Ensuring the data directory exists.")
        os.makedirs(opendata_directory)
        LOGGER.debug("Created opendata directory at %s.", opendata_directory)
    except OSError:
        # ``makedirs`` fails when the directory already exists, but also on
        # any other error (permissions, a regular file in the way...). Only
        # the first case is fine to ignore.
        if not os.path.isdir(opendata_directory):
            raise
        LOGGER.debug("Opendata directory already existed, doing nothing.")

    is_built_ratp = os.path.isfile(
        os.path.join(opendata_directory, "ratp.json")
    )
    if not is_built_ratp or force:
        LOGGER.info("Building from RATP data.")
        if not _preprocess_ratp(opendata_directory):
            raise flatisfy.exceptions.DataBuildError("Error with RATP data.")

    # Both files are built together from the LaPoste data
    is_built_laposte = (
        os.path.isfile(os.path.join(opendata_directory, "cities.json")) and
        os.path.isfile(os.path.join(opendata_directory, "postal_codes.json"))
    )
    if not is_built_laposte or force:
        LOGGER.info("Building from LaPoste data.")
        if not _preprocess_laposte(opendata_directory):
            raise flatisfy.exceptions.DataBuildError(
                "Error with LaPoste data."
            )
def load_data(data_type, config):
    """
    Read one of the built data files from the opendata directory.

    :param data_type: A valid data identifier, one of ``postal_codes``,
        ``cities`` or ``ratp``.
    :param config: A config dictionary.
    :return: The loaded data. ``None`` if the query is incorrect, or if the
        file is missing or is not valid JSON.
    """
    known_types = ("postal_codes", "cities", "ratp")
    if data_type not in known_types:
        LOGGER.error("Invalid request. No %s data file.", data_type)
        return None

    datafile_path = os.path.join(
        config["data_directory"], "opendata", "%s.json" % data_type
    )
    try:
        with open(datafile_path, "r") as data_file:
            loaded = json.load(data_file)
    except IOError:
        LOGGER.error("No such data file: %s.", datafile_path)
        return None
    except ValueError:
        LOGGER.error("Invalid JSON data file: %s.", datafile_path)
        return None

    # Empty data is still returned, but is worth a warning
    if len(loaded) == 0:
        LOGGER.warning("Loading empty data for %s.", data_type)

    return loaded

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,64 @@
# coding: utf-8
"""
This module contains functions related to the database.
"""
from __future__ import absolute_import, print_function, unicode_literals
import sqlite3
from contextlib import contextmanager
from sqlalchemy import event, create_engine
from sqlalchemy.engine import Engine
from sqlalchemy.orm import sessionmaker
import flatisfy.models.flat # noqa: F401
from flatisfy.database.base import BASE
@event.listens_for(Engine, "connect")
def set_sqlite_pragma(dbapi_connection, _):
    """
    Turn foreign keys support on for every new SQLite connection.

    Connections from other DB backends are left untouched.
    """
    if not isinstance(dbapi_connection, sqlite3.Connection):
        # Play well with other DB backends
        return

    pragma_cursor = dbapi_connection.cursor()
    pragma_cursor.execute("PRAGMA foreign_keys=ON")
    pragma_cursor.close()
def init_db(database_uri=None):
    """
    Create the engine, ensure the tables exist and build a session helper.

    :param database_uri: An URI describing an engine to use. Defaults to
        in-memory SQLite database.
    :return: A function returning a context manager. The context manager
        yields a new session, which is committed when the block succeeds,
        rolled back when it raises, and closed in both cases.
    """
    uri = "sqlite:///:memory:" if database_uri is None else database_uri

    engine = create_engine(uri)
    BASE.metadata.create_all(engine, checkfirst=True)
    session_factory = sessionmaker(bind=engine)

    @contextmanager
    def get_session():
        """
        Provide a transactional scope around a series of operations.

        From [1].
        [1]: http://docs.sqlalchemy.org/en/latest/orm/session_basics.html#when-do-i-construct-a-session-when-do-i-commit-it-and-when-do-i-close-it.
        """
        db_session = session_factory()
        try:
            yield db_session
            db_session.commit()
        except BaseException:
            # Whatever was raised, undo the pending changes and let the error
            # propagate.
            db_session.rollback()
            raise
        finally:
            db_session.close()

    return get_session

10
flatisfy/database/base.py Normal file
View File

@ -0,0 +1,10 @@
# coding: utf-8
"""
This module contains the definition of the declarative SQLAlchemy base.
"""
from __future__ import absolute_import, print_function, unicode_literals
from sqlalchemy.ext.declarative import declarative_base
# SQLAlchemy declarative base. ``flatisfy.database.init_db`` creates the
# tables registered on ``BASE.metadata``.
BASE = declarative_base()

View File

@ -0,0 +1,48 @@
# coding: utf-8
"""
This module implements custom types in SQLAlchemy.
"""
from __future__ import absolute_import, print_function, unicode_literals
import json
import sqlalchemy.types as types
class StringyJSON(types.TypeDecorator):
    """
    Column type storing JSON data in a TEXT column, for SQLite.

    From
    https://avacariu.me/articles/2016/compiling-json-as-text-for-sqlite-with-sqlalchemy.

    .. note :: The associated field is immutable. That is, changes to the data
        (typically, changing the value of a dict field) will not trigger an
        update on the SQL side upon ``commit`` as the reference to the object
        will not have been updated. One should force the update by forcing an
        update of the reference (by performing a ``copy`` operation on the
        dict for instance).
    """
    impl = types.TEXT

    def process_bind_param(self, value, dialect):
        """
        Serialize the value to a JSON string.

        :param value: The value to store. ``None`` is kept as is.
        :param dialect: Not used.
        :return: The JSON string for ``value``, or ``None``.
        """
        return None if value is None else json.dumps(value)

    def process_result_value(self, value, dialect):
        """
        Parse the stored JSON string back into a Python value.

        :param value: The stored JSON string. ``None`` is kept as is.
        :param dialect: Not used.
        :return: The decoded value, or ``None``.
        """
        return None if value is None else json.loads(value)
# JSON column type: the generic ``types.JSON``, except on SQLite.
# TypeEngine.with_variant says "use StringyJSON instead when
# connecting to 'sqlite'"
# pylint: disable=invalid-name
MagicJSON = types.JSON().with_variant(StringyJSON, 'sqlite')

13
flatisfy/exceptions.py Normal file
View File

@ -0,0 +1,13 @@
# coding: utf-8
"""
This module contains all the exceptions definitions for the Flatisfy-specific
exceptions.
"""
from __future__ import absolute_import, print_function, unicode_literals
class DataBuildError(Exception):
    """
    Raised when a data file could not be built.
    """

76
flatisfy/fetch.py Normal file
View File

@ -0,0 +1,76 @@
# coding: utf-8
"""
This module contains all the code related to fetching and loading flats lists.
"""
from __future__ import absolute_import, print_function, unicode_literals
import json
import logging
import subprocess
# Module-level logger.
LOGGER = logging.getLogger(__name__)
def fetch_flats_list(config):
"""
Fetch the available flats using the Flatboob / Weboob config.
:param config: A config dict.
:return: A list of all available flats.
"""
flats_list = []
for query in config["queries"]:
max_entries = config["max_entries"]
if max_entries is None:
max_entries = 0
LOGGER.info("Loading flats from query %s.", query)
flatboob_output = subprocess.check_output(
["../weboob/tools/local_run.sh",