diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index af729cd..57ff4d4 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,14 +1,14 @@ -# Contributing - -We welcome contributions from anyone beginner or advanced. Please before working on some feature - -* search through the past issues, your concern may have been raised by others in the past. Check through -closed issues as well. -* if there is no open issue for your feature request please open one up to coordinate all collaborators -* write your feature -* submit a pull request on this repo with: - * a brief description - * **detail of the expected change(s) in behaviour** - * how to test it (if it's not obvious) - -Ask someone to test it. +# Contributing + +We welcome contributions from anyone, beginner or advanced. Before working on a feature, please: + +* search through past issues; your concern may have been raised by others before. Check through +closed issues as well. +* if there is no open issue for your feature request, open one up to coordinate with all collaborators +* write your feature +* submit a pull request on this repo with: + * a brief description + * **detail of the expected change(s) in behaviour** + * how to test it (if it's not obvious) + +Ask someone to test it. diff --git a/LICENSE b/LICENSE index 261eeb9..29f81d8 100644 --- a/LICENSE +++ b/LICENSE @@ -1,201 +1,201 @@ - Apache License - Version 2.0, January 2004 - http://www.apache.org/licenses/ - - TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - - 1. Definitions. - - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. - - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. - - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. - - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. - - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. - - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. - - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). - - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof.
- - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." - - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - - 2. Grant of Copyright License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - - 3. Grant of Patent License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - - 4. Redistribution. 
You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative Works, in at least one - of the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. The contents - of the NOTICE file are for informational purposes only and - do not modify the License. You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. - - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - - 5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - - 6. Trademarks. This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - - 7. Disclaimer of Warranty. Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - - 8. Limitation of Liability. 
In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - - 9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. - - END OF TERMS AND CONDITIONS - - APPENDIX: How to apply the Apache License to your work. - - To apply the Apache License to your work, attach the following - boilerplate notice, with the fields enclosed by brackets "[]" - replaced with your own identifying information. (Don't include - the brackets!) The text should be enclosed in the appropriate - comment syntax for the file format. We also recommend that a - file or class name and description of purpose be included on the - same "printed page" as the copyright notice for easier - identification within third-party archives. - - Copyright [yyyy] [name of copyright owner] - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. 
+ + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. 
Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/README.md b/README.md index 9bb6c46..31463b1 100644 --- a/README.md +++ b/README.md @@ -1,159 +1,159 @@ -# featurewiz - -![banner](featurewiz_logo.jpg) - -Featurewiz is a new python library for selecting the best features in your data set fast! -(featurewiz logo created using Wix) -

Two methods are used in this version of featurewiz:
- -1. SULOV -> SULOV means Searching for Uncorrelated List of Variables. The SULOV method is explained in this chart below. SULOV stands for: “Searching for Uncorrelated List Of Variables” - -Here is a simple way of explaining how it works: -

    -
  1. Find all the pairs of highly correlated variables exceeding a correlation threshold (say absolute(0.7)). -
  2. Then find their MIS score (Mutual Information Score) to the target variable. MIS is a non-parametric scoring method. So its suitable for all kinds of variables and target. -
  3. Now take each pair of correlated variables, then knock off the one with the lower MIS score. -
  4. What’s left is the ones with the highest Information scores and least correlation with each other. -
- - -![sulov](SULOV.jpg) - -2. Recursive XGBoost: Once SULOV has selected variables that have high mutual information scores with least less correlation amongst them, we use XGBoost to repeatedly find best features among the remaining variables after SULOV. The Recursive XGBoost method is explained in this chart below. -Once have done SULOV method, now select the best variables using XGBoost feature important but apply it recursively to smaller and smaller sets of data in your data set. This is how it works: -
    -
  1. Select all variables in data set and the full data split into train and valid sets. -
  2. Find top X features (could be 10) on train using valid for early stopping (to prevent over-fitting) -
  3. Then take next set of vars and find top X -
  4. Do this 5 times. Combine all selected features and de-duplicate them. -
- - -![xgboost](xgboost.jpg) - -3. Classification of variables by type: It automatically detects the different types of variables in your data set and converts them to numeric except date-time, NLP and large-text variables. These variables must be properly encoded and transformed (or embedded) into numeric form by you if you want them included in featurewiz selection.
- -4. Best step after feature engineering: Featurewiz represents the next best step you can perform after doing some feature engineering on your own since you might have added some highly correlated or even wasteful features when you use some automated tools such as featuretools to perform feature engineering. With featurewiz as the last step before you do modeling, you can perform feature selection with featurewiz and the best and least number of features before doing more expensive training and inference. - -

To upgrade to the best, most stable and full-featured version always do the following:
-Use $ pip install featurewiz --upgrade --ignore-installed
-or -pip install git+https://github.com/AutoViML/featurewiz.git
- -## Table of Contents -

- -## Background - -To learn more about how featurewiz works under the hood, watch this [video](https://www.youtube.com/embed/ZiNutwPcAU0)
- -

featurewiz was designed for selecting High Performance variables with the fewest steps. - -In most cases, featurewiz builds models with 20%-99% fewer features than your original data set with nearly the same or slightly lower performance (this is based on my trials. Your experience may vary).
-

-featurewiz is every Data Scientist's feature wizard that will:

    -
  1. Automatically pre-process data: you can send in your entire dataframe as is and featurewiz will classify and change/label encode categorical variables changes to help XGBoost processing. That way, you don't have to preprocess your data before using featurewiz
    -
  2. Assist you with variable classification: featurewiz classifies variables automatically. This is very helpful when you have hundreds if not thousands of variables since it can readily identify which of those are numeric vs categorical vs NLP text vs date-time variables and so on.
    -
  3. Perform feature reduction automatically. When you have small data sets and you know your domain well, it is easy to perhaps do EDA and identify which variables are important. But when you have a very large data set with hundreds if not thousands of variables, selecting the best features from your model can mean the difference between a bloated and highly complex model or a simple model with the fewest and most information-rich features. featurewiz uses XGBoost repeatedly to perform feature selection. You must try it on your large data sets and compare!
    -
  4. Explain SULOV method graphically using networkx library so you can see which variables are highly correlated to which ones and which of those have high or low mutual information scores automatically. Just set verbose = 2 to see the graph.
    -
-featurewiz is built using xgboost, numpy, pandas and matplotlib. It should run on most Python 3 Anaconda installations. You won't have to import any special -libraries other than "XGBoost" and "networkx" library. We use "networkx" library for interpretability.
But if you don't have these libraries, featurewiz will install those for you automatically. - -## Install - -**Prerequsites:** - -- [Anaconda](https://docs.anaconda.com/anaconda/install/) - -To clone featurewiz, it is better to create a new environment, and install the required dependencies: - -To install from PyPi: - -``` -conda create -n python=3.7 anaconda -conda activate # ON WINDOWS: `source activate ` -pip install featurewiz -or -pip install git+https://github.com/AutoViML/featurewiz.git -``` - -To install from source: - -``` -cd -git clone git@github.com:AutoViML/featurewiz.git -# or download and unzip https://github.com/AutoViML/featurewiz/archive/master.zip -conda create -n python=3.7 anaconda -conda activate # ON WINDOWS: `source activate ` -cd featurewiz -pip install -r requirements.txt -``` - -## Usage - -In the same directory, open a Jupyter Notebook and use this line to import the .py file: - -``` -from featurewiz import featurewiz -``` - -Load a data set (any CSV or text file) into a Pandas dataframe and give it the name of the target(s) variable. If you have more than one target, it will handle multi-label targets too. Just give it a list of variables in that case. If you don't have a dataframe, you can simply enter the name and path of the file to load into featurewiz: - -``` -features = featurewiz( - dataname, - target, - corr_limit=0.7, - verbose=2, - sep=",", - header=0) -``` - -Finally, it returns the list of variables selected. - -This list of selected features is ready for you to now to do further modeling. - -featurewiz works on any Multi-Class, Multi-Label Data Set. So you can have as many target labels as you want. -You don't have to tell featurwiz whether it is a Regression or Classification problem. It will decide that automatically. - -## API - -**Arguments** - -- `dataname`: could be a datapath+filename or a dataframe. It will detect whether your input is a filename or a dataframe and load it automatically. -- `target`: name of the target variable in the data set. -- `corr_limit`: if you want to set your own threshold for removing variables as highly correlated, then give it here. The default is 0.7 which means variables less than -0.7 and greater than 0.7 in pearson's correlation will be candidates for removal. -- `verbose`: This has 3 possible states: - - `0` limited output. Great for running this silently and getting fast results. - - `1` more verbiage. Great for knowing how results were and making changes to flags in input. - - `2` SULOV charts and output. Great for finding out what happens under the hood for SULOV method. - -**Return values** - -- `features`: the fewest number of features in your model to make it perform well - -## Maintainers - -* [@AutoViML](https://github.com/AutoViML) - -## Contributing - -See [the contributing file](CONTRIBUTING.md)! - -PRs accepted. - -## License - -Apache License 2.0 © 2020 Ram Seshadri - -## DISCLAIMER -This project is not an official Google project. It is not supported by Google and Google specifically disclaims all warranties as to its quality, merchantability, or fitness for a particular purpose. +# featurewiz + +![banner](featurewiz_logo.jpg) + +Featurewiz is a new python library for selecting the best features in your data set fast! +(featurewiz logo created using Wix) +

Two methods are used in this version of featurewiz:
+ +1. SULOV -> SULOV stands for “Searching for Uncorrelated List Of Variables”. The SULOV method is explained in the chart below. + +Here is a simple way of explaining how it works: +

    +
  1. Find all pairs of highly correlated variables that exceed a correlation threshold (say, an absolute correlation of 0.7). +
  2. Then find their MIS score (Mutual Information Score) with respect to the target variable. MIS is a non-parametric scoring method, so it is suitable for all kinds of variables and targets. +
  3. Now take each pair of correlated variables and knock off the one with the lower MIS score. +
  4. What remains are the variables with the highest information scores and the least correlation with each other (see the sketch below). +
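+
+A minimal sketch of the SULOV idea is shown below, using plain pandas and scikit-learn. It is only an illustration, not featurewiz's actual implementation: it assumes all features are already numeric and a classification target (use mutual_info_regression for regression), and the helper name sulov_sketch is made up for this example.
+
+```
+import pandas as pd
+from sklearn.feature_selection import mutual_info_classif
+
+def sulov_sketch(df, target, corr_limit=0.7):
+    """Simplified illustration of SULOV -- not featurewiz's actual code."""
+    X, y = df.drop(columns=[target]), df[target]
+    # Step 1: find all pairs of variables whose absolute correlation exceeds the threshold
+    corr = X.corr().abs()
+    cols = list(corr.columns)
+    pairs = [(a, b) for i, a in enumerate(cols)
+             for b in cols[i + 1:] if corr.loc[a, b] > corr_limit]
+    # Step 2: score every variable against the target with Mutual Information
+    mis = dict(zip(X.columns, mutual_info_classif(X, y)))
+    # Step 3: for each correlated pair, knock off the variable with the lower MIS score
+    removed = set()
+    for a, b in pairs:
+        if a not in removed and b not in removed:
+            removed.add(a if mis[a] < mis[b] else b)
+    # Step 4: what is left are high-MIS variables with little correlation amongst them
+    return [c for c in X.columns if c not in removed]
+```
+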
+ + +![sulov](SULOV.jpg) + +2. Recursive XGBoost: Once SULOV has selected variables that have high mutual information scores and low correlation amongst them, we use XGBoost to repeatedly find the best features among the remaining variables. The Recursive XGBoost method is explained in the chart below. +After the SULOV step, the best variables are selected using XGBoost feature importance, applied recursively to smaller and smaller subsets of the variables in your data set. This is how it works: +
    +
  1. Select all variables in the data set and split the full data into train and valid sets. +
  2. Find the top X features (X could be 10) on the train set, using the valid set for early stopping (to prevent over-fitting). +
  3. Then take the next set of variables and find the top X among them. +
  4. Do this 5 times. Combine all the selected features and de-duplicate them (see the sketch below). +
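+
+Here is a rough sketch of that recursive loop, again only an illustration and not featurewiz's actual code: it assumes a classification target, equal-sized column chunks, and xgboost version 1.6 or later (older versions pass early_stopping_rounds to fit() instead); the helper name recursive_xgboost_sketch is made up for this example.
+
+```
+import numpy as np
+import pandas as pd
+from sklearn.model_selection import train_test_split
+from xgboost import XGBClassifier
+
+def recursive_xgboost_sketch(df, target, top_x=10, rounds=5):
+    """Simplified illustration of recursive XGBoost selection -- not featurewiz's actual code."""
+    X, y = df.drop(columns=[target]), df[target]
+    # Split the full data into train and valid sets (valid is used for early stopping)
+    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=0)
+    selected = []
+    # Work through the variables in chunks, keeping the top X features of each round
+    for cols in np.array_split(np.asarray(X.columns), rounds):
+        cols = list(cols)
+        model = XGBClassifier(n_estimators=100, early_stopping_rounds=10, eval_metric="logloss")
+        model.fit(X_train[cols], y_train, eval_set=[(X_valid[cols], y_valid)], verbose=False)
+        ranked = pd.Series(model.feature_importances_, index=cols).nlargest(top_x)
+        selected.extend(ranked.index)
+    # Combine all selected features and de-duplicate them
+    return sorted(set(selected))
+```
+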
+ + +![xgboost](xgboost.jpg) + +3. Classification of variables by type: featurewiz automatically detects the different types of variables in your data set and converts them to numeric, except for date-time, NLP and large-text variables. Those variables must be properly encoded and transformed (or embedded) into numeric form by you if you want them included in featurewiz selection, as in the example below.
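+
+For example, a date-time column could be expanded into plain numeric columns before calling featurewiz. This is only one possible approach, and the column names below are hypothetical:
+
+```
+import pandas as pd
+
+# Hypothetical dataframe with a date-time column named "order_date"
+df = pd.DataFrame({"order_date": pd.to_datetime(["2020-01-05", "2020-02-17", "2020-03-09"]),
+                   "sales": [120.0, 98.5, 143.2]})
+
+# Expand the date-time column into numeric features, then drop the original column
+df["order_year"] = df["order_date"].dt.year
+df["order_month"] = df["order_date"].dt.month
+df["order_dayofweek"] = df["order_date"].dt.dayofweek
+df = df.drop(columns=["order_date"])
+# The now fully-numeric dataframe can be passed to featurewiz for selection
+```
+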
+ +4. Best step after feature engineering: Featurewiz is the natural next step after you have done your own feature engineering, since automated tools such as featuretools can add highly correlated or even wasteful features. With featurewiz as the last step before modeling, you can select the best and fewest features, and only then move on to more expensive training and inference. + +

To upgrade to the best, most stable and full-featured version, always do the following:
+Use $ pip install featurewiz --upgrade --ignore-installed
+or +pip install git+https://github.com/AutoViML/featurewiz.git
+ +## Table of Contents +

+ +## Background + +To learn more about how featurewiz works under the hood, watch this [video](https://www.youtube.com/embed/ZiNutwPcAU0)
+ +

featurewiz was designed for selecting high-performance variables with the fewest steps. + +In most cases, featurewiz builds models with 20%-99% fewer features than your original data set, with nearly the same or only slightly lower performance (this is based on my trials; your experience may vary).
+

+featurewiz is every Data Scientist's feature wizard that will:

    +
  1. Automatically pre-process data: you can send in your entire dataframe as is, and featurewiz will classify and label-encode categorical variables to help XGBoost process them (a simple illustration of label encoding appears after this list). That way, you don't have to preprocess your data before using featurewiz.
    +
  2. Assist you with variable classification: featurewiz classifies variables automatically. This is very helpful when you have hundreds, if not thousands, of variables, since it can readily identify which of those are numeric vs categorical vs NLP text vs date-time variables, and so on.
    +
  3. Perform feature reduction automatically. When you have a small data set and you know your domain well, it is relatively easy to do EDA and identify which variables are important. But when you have a very large data set with hundreds, if not thousands, of variables, selecting the best features can mean the difference between a bloated, highly complex model and a simple model with the fewest and most information-rich features. featurewiz uses XGBoost repeatedly to perform feature selection. You must try it on your large data sets and compare!
    +
  4. Explain the SULOV method graphically using the networkx library, so you can see which variables are highly correlated with which others, and which of those have high or low mutual information scores. Just set verbose = 2 to see the graph.
    +
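+
+As an aside, this is roughly what label encoding a categorical column means (featurewiz does this kind of conversion for you; the column name below is hypothetical and this is not featurewiz's exact preprocessing):
+
+```
+import pandas as pd
+
+df = pd.DataFrame({"city": ["Paris", "Tokyo", "Paris", "Delhi"]})
+
+# Label encoding: map each distinct category to an integer code that XGBoost can consume
+codes, uniques = pd.factorize(df["city"])
+df["city"] = codes
+print(dict(zip(uniques, range(len(uniques)))))  # {'Paris': 0, 'Tokyo': 1, 'Delhi': 2}
+```
+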
+featurewiz is built using xgboost, numpy, pandas and matplotlib. It should run on most Python 3 Anaconda installations. You won't have to install any special +libraries other than "xgboost" and "networkx". We use the "networkx" library for interpretability.
But if you don't have these libraries, featurewiz will install them for you automatically. + +## Install + +**Prerequisites:** + +- [Anaconda](https://docs.anaconda.com/anaconda/install/) + +To install featurewiz, it is better to create a new environment and install the required dependencies: + +To install from PyPI: + +``` +conda create -n python=3.7 anaconda +conda activate # ON WINDOWS: `source activate ` +pip install featurewiz +or +pip install git+https://github.com/AutoViML/featurewiz.git +``` + +To install from source: + +``` +cd +git clone git@github.com:AutoViML/featurewiz.git +# or download and unzip https://github.com/AutoViML/featurewiz/archive/master.zip +conda create -n python=3.7 anaconda +conda activate # ON WINDOWS: `source activate ` +cd featurewiz +pip install -r requirements.txt +```
+ +## Usage + +In the same directory, open a Jupyter Notebook and use this line to import the package: + +``` +from featurewiz import featurewiz +``` + +Load a data set (any CSV or text file) into a Pandas dataframe and give it the name of the target variable(s). If you have more than one target, featurewiz handles multi-label targets too; just give it a list of target variables in that case. If you don't have a dataframe, you can simply enter the name and path of the file to load into featurewiz: + +``` +features = featurewiz( + dataname, + target, + corr_limit=0.7, + verbose=2, + sep=",", + header=0) +``` + +Finally, it returns the list of variables selected. + +This list of selected features is now ready for you to use in further modeling. + +featurewiz works on any multi-class or multi-label data set, so you can have as many target labels as you want. +You don't have to tell featurewiz whether it is a regression or classification problem; it will decide that automatically.
+ +## API + +**Arguments** + +- `dataname`: could be a datapath+filename or a dataframe. featurewiz will detect whether your input is a filename or a dataframe and load it automatically. +- `target`: name of the target variable in the data set. +- `corr_limit`: if you want to set your own threshold for removing highly correlated variables, give it here. The default is 0.7, which means variables whose Pearson correlation is below -0.7 or above 0.7 will be candidates for removal. +- `verbose`: This has 3 possible states: + - `0` limited output. Great for running this silently and getting fast results. + - `1` more verbose output. Great for understanding the results and adjusting the input flags. + - `2` SULOV charts and output. Great for finding out what happens under the hood in the SULOV method. + +**Return values** + +- `features`: the smallest set of features your model needs to perform well
+ +## Maintainers + +* [@AutoViML](https://github.com/AutoViML) + +## Contributing + +See [the contributing file](CONTRIBUTING.md)! + +PRs accepted. + +## License + +Apache License 2.0 © 2020 Ram Seshadri + +## DISCLAIMER +This project is not an official Google project. It is not supported by Google and Google specifically disclaims all warranties as to its quality, merchantability, or fitness for a particular purpose.
diff --git a/build/lib/featurewiz/__init__.py b/build/lib/featurewiz/__init__.py deleted file mode 100644 index 0674ce6..0000000 --- a/build/lib/featurewiz/__init__.py +++ /dev/null @@ -1,22 +0,0 @@ -# -*- coding: utf-8 -*- -################################################################################ -# featurewiz - fast feature selection using one line of code -# Python v3.6+ -# Created by Ram Seshadri -# Licensed under Apache License v2 -################################################################################ -# Version -from .__version__ import __version__ -from .featurewiz import featurewiz - -if __name__ == "__main__": - version_number = __version__ - print("""Running featurewiz version: %s. Call using: - features = featurewiz(dataname, target, corr_limit=0.70, - verbose=2)""" %version_number) -else: - version_number = __version__ - print("""Imported featurewiz version: %s. Call using: - features = featurewiz(dataname, target, corr_limit=0.70, - verbose=2)""" %version_number) -################################################################################ diff --git a/build/lib/featurewiz/__version__.py b/build/lib/featurewiz/__version__.py deleted file mode 100644 index 3f5d8a6..0000000 --- a/build/lib/featurewiz/__version__.py +++ /dev/null @@ -1,10 +0,0 @@ -# -*- coding: utf-8 -*- -"""Specifies the version of the FeatureWiz package.""" - -__title__ = "featurewiz" -__author__ = "Ram Seshadri" -__description__ = "Fast Feature Selection for any data set, any size" -__url__ = "https://github.com/Auto_ViML/featurewiz.git" -__version__ = "0.0.6" -__license__ = "Apache License 2.0" -__copyright__ = "2020 Google" diff --git a/build/lib/featurewiz/featurewiz.py b/build/lib/featurewiz/featurewiz.py deleted file mode 100644 index a7d6f94..0000000 --- a/build/lib/featurewiz/featurewiz.py +++ /dev/null @@ -1,1079 +0,0 @@ -############################################################################## -#Copyright 2019 Google LLC -# -#Licensed under the Apache License, Version 2.0 (the "License"); -#you may not use this file except in compliance with the License. -#You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -#Unless required by applicable law or agreed to in writing, software -#distributed under the License is distributed on an "AS IS" BASIS, -#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -#See the License for the specific language governing permissions and -#limitations under the License. -################################################################################# -#### C O D E C A N B E RE-U S E D W I T H C I T A T I O N B E L O W #### -################################################################################# -############### F E A T U R E W I Z ############### -################ featurewiz library developed by Ram Seshadri ################# -#### THIS METHOD IS KNOWN AS SULOV METHOD in HONOR OF my mom, SULOCHANA ######### -##### SULOV means Searching for Uncorrelated List Of Variables ########### -############### v 0.0.1 ################ -############### A L L R I G H T S R E S E R V E D ################ -################################################################################# -##### This project is not an official Google project. It is not supported by #### -##### Google and Google specifically disclaims all warranties as to its quality,# -##### merchantability, or fitness for a particular purpose. 
#################### -################################################################################# -import pandas as pd -import numpy as np -from sklearn.model_selection import KFold -from sklearn.model_selection import GridSearchCV -from sklearn.multioutput import MultiOutputClassifier, MultiOutputRegressor -from sklearn.multiclass import OneVsRestClassifier -import xgboost as xgb -from xgboost.sklearn import XGBClassifier -from xgboost.sklearn import XGBRegressor -from sklearn.model_selection import train_test_split -################################################################################ -#### The warnings from Sklearn are so annoying that I have to shut it off ####### -import warnings -warnings.filterwarnings("ignore") -from sklearn.exceptions import DataConversionWarning -warnings.filterwarnings(action='ignore', category=DataConversionWarning) -def warn(*args, **kwargs): - pass -warnings.warn = warn -#################################################################################### -import re -import pdb -import pprint -from itertools import cycle, combinations -from collections import defaultdict, OrderedDict -import copy -import time -import sys -import random -import xlrd -import statsmodels -from io import BytesIO -import base64 -from functools import reduce -import copy -####################################################################################################### -def classify_features(dfte, depVar, verbose=0): - dfte = copy.deepcopy(dfte) - if isinstance(depVar, list): - orig_preds = [x for x in list(dfte) if x not in depVar] - else: - orig_preds = [x for x in list(dfte) if x not in [depVar]] - ################# CLASSIFY COLUMNS HERE ###################### - var_df = classify_columns(dfte[orig_preds], verbose) - ##### Classify Columns ################ - IDcols = var_df['id_vars'] - discrete_string_vars = var_df['nlp_vars']+var_df['discrete_string_vars'] - cols_delete = var_df['cols_delete'] - bool_vars = var_df['string_bool_vars'] + var_df['num_bool_vars'] - int_vars = var_df['int_vars'] - categorical_vars = var_df['cat_vars'] + var_df['factor_vars'] + int_vars + bool_vars - date_vars = var_df['date_vars'] - if len(var_df['continuous_vars'])==0 and len(int_vars)>0: - continuous_vars = var_df['int_vars'] - categorical_vars = left_subtract(categorical_vars, int_vars) - int_vars = [] - else: - continuous_vars = var_df['continuous_vars'] - preds = [x for x in orig_preds if x not in IDcols+cols_delete+discrete_string_vars] - if len(IDcols+cols_delete+discrete_string_vars) == 0: - print(' No variables removed since no ID or low-information variables found in data set') - else: - print(' %d variables removed since they were ID or low-information variables' - %len(IDcols+cols_delete+discrete_string_vars)) - if verbose >= 1: - print(' List of variables removed: %s' %(IDcols+cols_delete+discrete_string_vars)) - ############# Check if there are too many columns to visualize ################ - ppt = pprint.PrettyPrinter(indent=4) - if verbose==1 and len(cols_list) <= max_cols_analyzed: - marthas_columns(dft,verbose) - print(" Columns to delete:") - ppt.pprint(' %s' % cols_delete) - print(" Boolean variables %s ") - ppt.pprint(' %s' % bool_vars) - print(" Categorical variables %s ") - ppt.pprint(' %s' % categorical_vars) - print(" Continuous variables %s " ) - ppt.pprint(' %s' % continuous_vars) - print(" Discrete string variables %s " ) - ppt.pprint(' %s' % discrete_string_vars) - print(" Date and time variables %s " ) - ppt.pprint(' %s' % date_vars) - print(" ID 
variables %s ") - ppt.pprint(' %s' % IDcols) - print(" Target variable %s ") - ppt.pprint(' %s' % depVar) - elif verbose==1 and len(cols_list) > max_cols_analyzed: - print(' Total columns > %d, too numerous to list.' %max_cols_analyzed) - features_dict = dict([('IDcols',IDcols),('cols_delete',cols_delete),('bool_vars',bool_vars),('categorical_vars',categorical_vars), - ('continuous_vars',continuous_vars),('discrete_string_vars',discrete_string_vars), - ('date_vars',date_vars)]) - return features_dict -####################################################################################################### -def marthas_columns(data,verbose=0): - """ - This program is named in honor of my one of students who came up with the idea for it. - It's a neat way of printing data types and information compared to the boring describe() function in Pandas. - """ - data = data[:] - print('Data Set Shape: %d rows, %d cols' % data.shape) - if data.shape[1] > 30: - print('Too many columns to print') - else: - if verbose==1: - print('Data Set columns info:') - for col in data.columns: - print('* %s: %d nulls, %d unique vals, most common: %s' % ( - col, - data[col].isnull().sum(), - data[col].nunique(), - data[col].value_counts().head(2).to_dict() - )) - print('--------------------------------------------------------------------') -################################################################################ -######### NEW And FAST WAY to CLASSIFY COLUMNS IN A DATA SET ####### -################################################################################ -def classify_columns(df_preds, verbose=0): - """ - Takes a dataframe containing only predictors to be classified into various types. - DO NOT SEND IN A TARGET COLUMN since it will try to include that into various columns. - Returns a data frame containing columns and the class it belongs to such as numeric, - categorical, date or id column, boolean, nlp, discrete_string and cols to delete... 
- ####### Returns a dictionary with 10 kinds of vars like the following: # continuous_vars,int_vars - # cat_vars,factor_vars, bool_vars,discrete_string_vars,nlp_vars,date_vars,id_vars,cols_delete - """ - train = copy.deepcopy(df_preds) - #### If there are 30 chars are more in a discrete_string_var, it is then considered an NLP variable - max_nlp_char_size = 30 - max_cols_to_print = 30 - print('############## C L A S S I F Y I N G V A R I A B L E S ####################') - print('Classifying variables in data set...') - #### Cat_Limit defines the max number of categories a column can have to be called a categorical colum - cat_limit = 35 - float_limit = 15 #### Make this limit low so that float variables below this limit become cat vars ### - def add(a,b): - return a+b - sum_all_cols = dict() - orig_cols_total = train.shape[1] - #Types of columns - cols_delete = [col for col in list(train) if (len(train[col].value_counts()) == 1 - ) | (train[col].isnull().sum()/len(train) >= 0.90)] - train = train[left_subtract(list(train),cols_delete)] - var_df = pd.Series(dict(train.dtypes)).reset_index(drop=False).rename( - columns={0:'type_of_column'}) - sum_all_cols['cols_delete'] = cols_delete - var_df['bool'] = var_df.apply(lambda x: 1 if x['type_of_column'] in ['bool','object'] - and len(train[x['index']].value_counts()) == 2 else 0, axis=1) - string_bool_vars = list(var_df[(var_df['bool'] ==1)]['index']) - sum_all_cols['string_bool_vars'] = string_bool_vars - var_df['num_bool'] = var_df.apply(lambda x: 1 if x['type_of_column'] in [np.uint8, - np.uint16, np.uint32, np.uint64, - 'int8','int16','int32','int64', - 'float16','float32','float64'] and len( - train[x['index']].value_counts()) == 2 else 0, axis=1) - num_bool_vars = list(var_df[(var_df['num_bool'] ==1)]['index']) - sum_all_cols['num_bool_vars'] = num_bool_vars - ###### This is where we take all Object vars and split them into diff kinds ### - discrete_or_nlp = var_df.apply(lambda x: 1 if x['type_of_column'] in ['object'] and x[ - 'index'] not in string_bool_vars+cols_delete else 0,axis=1) - ######### This is where we figure out whether a string var is nlp or discrete_string var ### - var_df['nlp_strings'] = 0 - var_df['discrete_strings'] = 0 - var_df['cat'] = 0 - var_df['id_col'] = 0 - discrete_or_nlp_vars = var_df.loc[discrete_or_nlp==1]['index'].values.tolist() - if len(var_df.loc[discrete_or_nlp==1]) != 0: - for col in discrete_or_nlp_vars: - #### first fill empty or missing vals since it will blowup ### - train[col] = train[col].fillna(' ') - if train[col].map(lambda x: len(x) if type(x)==str else 0).mean( - ) >= max_nlp_char_size and len(train[col].value_counts() - ) <= int(0.9*len(train)) and col not in string_bool_vars: - var_df.loc[var_df['index']==col,'nlp_strings'] = 1 - elif len(train[col].value_counts()) > cat_limit and len(train[col].value_counts() - ) <= int(0.9*len(train)) and col not in string_bool_vars: - var_df.loc[var_df['index']==col,'discrete_strings'] = 1 - elif len(train[col].value_counts()) > cat_limit and len(train[col].value_counts() - ) == len(train) and col not in string_bool_vars: - var_df.loc[var_df['index']==col,'id_col'] = 1 - else: - var_df.loc[var_df['index']==col,'cat'] = 1 - nlp_vars = list(var_df[(var_df['nlp_strings'] ==1)]['index']) - sum_all_cols['nlp_vars'] = nlp_vars - discrete_string_vars = list(var_df[(var_df['discrete_strings'] ==1) ]['index']) - sum_all_cols['discrete_string_vars'] = discrete_string_vars - ###### This happens only if a string column happens to be an ID column ####### - #### DO 
NOT Add this to ID_VARS yet. It will be done later.. Dont change it easily... - #### Category DTYPE vars are very special = they can be left as is and not disturbed in Python. ### - var_df['dcat'] = var_df.apply(lambda x: 1 if str(x['type_of_column'])=='category' else 0, - axis=1) - factor_vars = list(var_df[(var_df['dcat'] ==1)]['index']) - sum_all_cols['factor_vars'] = factor_vars - ######################################################################## - date_or_id = var_df.apply(lambda x: 1 if x['type_of_column'] in [np.uint8, - np.uint16, np.uint32, np.uint64, - 'int8','int16', - 'int32','int64'] and x[ - 'index'] not in string_bool_vars+num_bool_vars+discrete_string_vars+nlp_vars else 0, - axis=1) - ######### This is where we figure out whether a numeric col is date or id variable ### - var_df['int'] = 0 - var_df['date_time'] = 0 - ### if a particular column is date-time type, now set it as a date time variable ## - var_df['date_time'] = var_df.apply(lambda x: 1 if x['type_of_column'] in [' 2050: - var_df.loc[var_df['index']==col,'id_col'] = 1 - else: - try: - pd.to_datetime(train[col],infer_datetime_format=True) - var_df.loc[var_df['index']==col,'date_time'] = 1 - except: - var_df.loc[var_df['index']==col,'id_col'] = 1 - else: - if train[col].min() < 1900 or train[col].max() > 2050: - if col not in num_bool_vars: - var_df.loc[var_df['index']==col,'int'] = 1 - else: - try: - pd.to_datetime(train[col],infer_datetime_format=True) - var_df.loc[var_df['index']==col,'date_time'] = 1 - except: - if col not in num_bool_vars: - var_df.loc[var_df['index']==col,'int'] = 1 - else: - pass - int_vars = list(var_df[(var_df['int'] ==1)]['index']) - date_vars = list(var_df[(var_df['date_time'] == 1)]['index']) - id_vars = list(var_df[(var_df['id_col'] == 1)]['index']) - sum_all_cols['int_vars'] = int_vars - copy_date_vars = copy.deepcopy(date_vars) - for date_var in copy_date_vars: - #### This test is to make sure sure date vars are actually date vars - try: - pd.to_datetime(train[date_var],infer_datetime_format=True) - except: - ##### if not a date var, then just add it to delete it from processing - cols_delete.append(date_var) - date_vars.remove(date_var) - sum_all_cols['date_vars'] = date_vars - sum_all_cols['id_vars'] = id_vars - sum_all_cols['cols_delete'] = cols_delete - ## This is an EXTREMELY complicated logic for cat vars. Don't change it unless you test it many times! 
- var_df['numeric'] = 0 - float_or_cat = var_df.apply(lambda x: 1 if x['type_of_column'] in ['float16', - 'float32','float64'] else 0, - axis=1) - if len(var_df.loc[float_or_cat == 1]) > 0: - for col in var_df.loc[float_or_cat == 1]['index'].values.tolist(): - if len(train[col].value_counts()) > 2 and len(train[col].value_counts() - ) <= float_limit and len(train[col].value_counts()) <= len(train): - var_df.loc[var_df['index']==col,'cat'] = 1 - else: - if col not in num_bool_vars: - var_df.loc[var_df['index']==col,'numeric'] = 1 - cat_vars = list(var_df[(var_df['cat'] ==1)]['index']) - continuous_vars = list(var_df[(var_df['numeric'] ==1)]['index']) - ######## V E R Y I M P O R T A N T ################################################### - ##### There are a couple of extra tests you need to do to remove abberations in cat_vars ### - cat_vars_copy = copy.deepcopy(cat_vars) - for cat in cat_vars_copy: - if df_preds[cat].dtype==float: - continuous_vars.append(cat) - cat_vars.remove(cat) - var_df.loc[var_df['index']==cat,'cat'] = 0 - var_df.loc[var_df['index']==cat,'numeric'] = 1 - elif len(df_preds[cat].value_counts()) == df_preds.shape[0]: - id_vars.append(cat) - cat_vars.remove(cat) - var_df.loc[var_df['index']==cat,'cat'] = 0 - var_df.loc[var_df['index']==cat,'id_col'] = 1 - sum_all_cols['cat_vars'] = cat_vars - sum_all_cols['continuous_vars'] = continuous_vars - sum_all_cols['id_vars'] = id_vars - ###### This is where you consoldate the numbers ########### - var_dict_sum = dict(zip(var_df.values[:,0], var_df.values[:,2:].sum(1))) - for col, sumval in var_dict_sum.items(): - if sumval == 0: - print('%s of type=%s is not classified' %(col,train[col].dtype)) - elif sumval > 1: - print('%s of type=%s is classified into more then one type' %(col,train[col].dtype)) - else: - pass - ############### This is where you print all the types of variables ############## - ####### Returns 8 vars in the following order: continuous_vars,int_vars,cat_vars, - ### string_bool_vars,discrete_string_vars,nlp_vars,date_or_id_vars,cols_delete - if verbose == 1: - print(" Number of Numeric Columns = ", len(continuous_vars)) - print(" Number of Integer-Categorical Columns = ", len(int_vars)) - print(" Number of String-Categorical Columns = ", len(cat_vars)) - print(" Number of Factor-Categorical Columns = ", len(factor_vars)) - print(" Number of String-Boolean Columns = ", len(string_bool_vars)) - print(" Number of Numeric-Boolean Columns = ", len(num_bool_vars)) - print(" Number of Discrete String Columns = ", len(discrete_string_vars)) - print(" Number of NLP String Columns = ", len(nlp_vars)) - print(" Number of Date Time Columns = ", len(date_vars)) - print(" Number of ID Columns = ", len(id_vars)) - print(" Number of Columns to Delete = ", len(cols_delete)) - if verbose == 2: - marthas_columns(df_preds,verbose=1) - print(" Numeric Columns: %s" %continuous_vars[:max_cols_to_print]) - print(" Integer-Categorical Columns: %s" %int_vars[:max_cols_to_print]) - print(" String-Categorical Columns: %s" %cat_vars[:max_cols_to_print]) - print(" Factor-Categorical Columns: %s" %factor_vars[:max_cols_to_print]) - print(" String-Boolean Columns: %s" %string_bool_vars[:max_cols_to_print]) - print(" Numeric-Boolean Columns: %s" %num_bool_vars[:max_cols_to_print]) - print(" Discrete String Columns: %s" %discrete_string_vars[:max_cols_to_print]) - print(" NLP text Columns: %s" %nlp_vars[:max_cols_to_print]) - print(" Date Time Columns: %s" %date_vars[:max_cols_to_print]) - print(" ID Columns: %s" %id_vars[:max_cols_to_print]) - 
print(" Columns that will not be considered in modeling: %s" %cols_delete[:max_cols_to_print]) - ##### now collect all the column types and column names into a single dictionary to return! - len_sum_all_cols = reduce(add,[len(v) for v in sum_all_cols.values()]) - if len_sum_all_cols == orig_cols_total: - print(' %d Predictors classified...' %orig_cols_total) - print(' This does not include the Target column(s)') - else: - print('No of columns classified %d does not match %d total cols. Continuing...' %( - len_sum_all_cols, orig_cols_total)) - ls = sum_all_cols.values() - flat_list = [item for sublist in ls for item in sublist] - if len(left_subtract(list(train),flat_list)) == 0: - print(' Missing columns = None') - else: - print(' Missing columns = %s' %left_subtract(list(train),flat_list)) - return sum_all_cols -################################################################################# -from collections import Counter -import time -from sklearn.feature_selection import chi2, mutual_info_regression, mutual_info_classif -from sklearn.feature_selection import SelectKBest -################################################################################## -def load_file_dataframe(dataname, sep=",", header=0, verbose=0): - start_time = time.time() - ########################### This is where we load file or data frame ############### - if isinstance(dataname,str): - #### this means they have given file name as a string to load the file ##### - if dataname != '' and dataname.endswith(('csv')): - codex = ['utf-8', 'iso-8859-1', 'cp1252', 'latin1'] - for code in codex: - try: - dfte = pd.read_csv(dataname,sep=sep,index_col=None,encoding=code) - print('Encoder %s chosen to read CSV file' %code) - print('Shape of your Data Set loaded: %s' %(dfte.shape,)) - return dfte - except: - print('Encoding codex %s does not work for this file' %code) - continue - elif dataname.endswith(('xlsx','xls','txt')): - #### It's very important to get header rows in Excel since people put headers anywhere in Excel# - dfte = pd.read_excel(dataname,header=header) - print('Shape of your Data Set loaded: %s' %(dfte.shape,)) - return dfte - else: - print('File not able to be loaded') - return - if isinstance(dataname,pd.DataFrame): - #### this means they have given a dataframe name to use directly in processing ##### - dfte = copy.deepcopy(dataname) - return dfte - else: - print('Dataname input must be a filename with path to that file or a Dataframe') - return -################################################################################## -# Removes duplicates from a list to return unique values - USED ONLYONCE -def find_remove_duplicates(values): - output = [] - seen = set() - for value in values: - if value not in seen: - output.append(value) - seen.add(value) - return output -################################################################################# -#### Regression or Classification type problem -def analyze_problem_type(train, target, verbose=0) : - target = copy.deepcopy(target) - cat_limit = 30 ### this determines the number of categories to name integers as classification ## - float_limit = 15 ### this limits the number of float variable categories for it to become cat var - if isinstance(target, str): - target = [target] - if len(target) == 1: - targ = target[0] - model_label = 'Single_Label' - else: - targ = target[0] - model_label = 'Multi_Label' - #### This is where you detect what kind of problem it is ################# - if train[targ].dtype in ['int64', 'int32','int16']: - if 
len(train[targ].unique()) <= 2: - model_class = 'Binary_Classification' - elif len(train[targ].unique()) > 2 and len(train[targ].unique()) <= cat_limit: - model_class = 'Multi_Classification' - else: - model_class = 'Regression' - elif train[targ].dtype in ['float']: - if len(train[targ].unique()) <= 2: - model_class = 'Binary_Classification' - elif len(train[targ].unique()) > 2 and len(train[targ].unique()) <= float_limit: - model_class = 'Multi_Classification' - else: - model_class = 'Regression' - else: - if len(train[targ].unique()) <= 2: - model_class = 'Binary_Classification' - else: - model_class = 'Multi_Classification' - ########### print this for the start of next step ########### - if verbose <= 1: - print('''################ %s %s Feature Selection Started #####################''' %( - model_label,model_class)) - return model_class -##################################################################################### -from collections import defaultdict -from collections import OrderedDict -import time -def return_dictionary_list(lst_of_tuples): - """ Returns a dictionary of lists if you send in a list of Tuples""" - orDict = defaultdict(list) - # iterating over list of tuples - for key, val in lst_of_tuples: - orDict[key].append(val) - return orDict -################################################################################## -def remove_variables_using_fast_correlation(df, numvars, modeltype, target, - corr_limit = 0.70,verbose=0): - """ - #### THIS METHOD IS KNOWN AS SULOV METHOD in HONOR OF my mother SULOCHANA SESHADRI ####### - ##### SULOV stands for Searching Uncorrelated List Of Variables ############ - This highly efficient method removes variables that are highly correlated using a series of - pair-wise correlation knockout rounds. It is extremely fast and hence can work on thousands - of variables in less than a minute, even on a laptop. You need to send in a list of numeric - variables and that's all! The method defines high Correlation as anything over 0.70 (absolute) - but this can be changed. If two variables have absolute correlation higher than this, they - will be marked, and using a process of elimination, one of them will get knocked out: - To decide order of variables to keep, we use mutuail information score to select. MIS returns - a ranked list of these correlated variables: when we select one, we knock out others - that it is correlated to. Then we select next var. This way we knock out correlated variables. - Finally we are left with uncorrelated variables that are also highly important in mutual score. - ############## YOU MUST INCLUDE THE ABOVE MESSAGE IF YOU COPY THIS CODE IN YOUR LIBRARY ##### - """ - import copy - target = copy.deepcopy(target) - print('Searching for highly correlated variables from %d variables using SULOV method' %len(numvars)) - print('##### SULOV : Searching for Uncorrelated List Of Variables (takes time...) 
############') - correlation_dataframe = df[numvars].corr().abs().astype(np.float16) - ######### This is how you create a dictionary of which var is highly correlated to a list of vars #### - corr_values = correlation_dataframe.values - col_index = correlation_dataframe.columns.tolist() - index_triupper = list(zip(np.triu_indices_from(corr_values,k=1)[0],np.triu_indices_from( - corr_values,k=1)[1])) - high_corr_index_list = [x for x in np.argwhere(abs(corr_values[np.triu_indices(len(corr_values), k = 1)])>=corr_limit)] - low_corr_index_list = [x for x in np.argwhere(abs(corr_values[np.triu_indices(len(corr_values), k = 1)]) 1: - corr_pair_dict[key] += val - else: - corr_pair_dict[key] = val - #### corr_pair_dict is used later to make the network diagram to see which vars are correlated to which - # Selecting upper triangle of correlation matrix ## this is a fast way to find highly correlated vars - upper_tri = correlation_dataframe.where(np.triu(np.ones(correlation_dataframe.shape), - k=1).astype(np.bool)) - empty_df = upper_tri[abs(upper_tri)>corr_limit] - ### if none of the variables are highly correlated, you can skip this whole drawing - if empty_df.isnull().all().all(): - print(' No highly correlated variables in data set to remove. All selected...') - return numvars - #### It's important to find the highly correlated features first ############# - lower_tri = correlation_dataframe.where(np.tril(np.ones(correlation_dataframe.shape), - k=-1).astype(np.bool)) - lower_df = lower_tri[abs(lower_tri)>corr_limit] - corr_list = empty_df.columns[[not(empty_df[x].isnull().all()) for x in list(empty_df)]].tolist( - )+lower_df.columns[[not(lower_df[x].isnull().all()) for x in list(lower_df)]].tolist() - corr_list = find_remove_duplicates(corr_list) - ###### This is for ordering the variables in the highest to lowest importance to target ### - if len(corr_list) == 0: - final_list = list(correlation_dataframe) - print('Selecting all (%d) variables since none of them are highly correlated...' 
%len(numvars)) - return numvars - else: - if isinstance(target, list): - target = target[0] - max_feats = len(corr_list) - if modeltype == 'Regression': - sel_function = mutual_info_regression - fs = SelectKBest(score_func=sel_function, k=max_feats) - else: - sel_function = mutual_info_classif - fs = SelectKBest(score_func=sel_function, k=max_feats) - try: - fs.fit(df[corr_list].astype(np.float16), df[target]) - mutual_info = dict(zip(corr_list,fs.scores_)) - #### The first variable in list has the highest correlation to the target variable ### - sorted_by_mutual_info =[key for (key,val) in sorted(mutual_info.items(), key=lambda kv: kv[1],reverse=True)] - ##### Now we select the final list of correlated variables ########### - selected_corr_list = [] - #### You have to make multiple copies of this sorted list since it is iterated many times #### - orig_sorted = copy.deepcopy(sorted_by_mutual_info) - copy_sorted = copy.deepcopy(sorted_by_mutual_info) - copy_pair = copy.deepcopy(corr_pair_dict) - #### select each variable by the highest mutual info and see what vars are correlated to it - for each_corr_name in copy_sorted: - ### add the selected var to the selected_corr_list - selected_corr_list.append(each_corr_name) - for each_remove in copy_pair[each_corr_name]: - #### Now remove each variable that is highly correlated to the selected variable - if each_remove in copy_sorted: - copy_sorted.remove(each_remove) - ##### Now we combine the uncorrelated list to the selected correlated list above - rem_col_list = left_subtract(list(correlation_dataframe),corr_list) - final_list = rem_col_list + selected_corr_list - removed_cols = left_subtract(numvars, final_list) - except: - print(' SULOV Method crashing due to memory error, trying alternative simpler method...') - #### Dropping highly correlated Features fast using simple linear correlation ### - removed_cols = remove_highly_correlated_vars_fast(train[numvars],corr_limit) - final_list = left_subtract(numvars, removed_cols) - if len(removed_cols) > 0: - print(' Removing (%d) highly correlated variables:' %(len(removed_cols))) - if len(removed_cols) <= 30: - print(' %s' %removed_cols) - if len(final_list) <= 30: - print(' Following (%d) vars selected: %s' %(len(final_list),final_list)) - ############## D R A W C O R R E L A T I O N N E T W O R K ################## - selected = copy.deepcopy(final_list) - try: - import networkx as nx - except: - print(' Python networkx library not installed. 
Install it for feature selection visualization.') - return - #### Now start building the graph ################### - gf = nx.Graph() - ### the mutual info score gives the size of the bubble ### - multiplier = 2100 - for each in orig_sorted: - gf.add_node(each, size=int(max(1,mutual_info[each]*multiplier))) - ######### This is where you calculate the size of each node to draw - sizes = [mutual_info[x]*multiplier for x in list(gf.nodes())] - #### The sizes of the bubbles for each node is determined by its mutual information score value - corr = df[corr_list].corr() - high_corr = corr[abs(corr)>corr_limit] - ## high_corr is the dataframe of a few variables that are highly correlated to each other - combos = combinations(corr_list,2) - ### this gives the strength of correlation between 2 nodes ## - multiplier = 20 - for (var1, var2) in combos: - if np.isnan(high_corr.loc[var1,var2]): - pass - else: - gf.add_edge(var1, var2,weight=multiplier*high_corr.loc[var1,var2]) - ######## Now start building the networkx graph ########################## - import copy - widths = nx.get_edge_attributes(gf, 'weight') - nodelist = gf.nodes() - cols = 5 - height_size = 5 - width_size = 15 - rows = int(len(corr_list)/cols) - if rows < 1: - rows = 1 - plt.figure(figsize=(width_size,min(20,height_size*rows))) - pos = nx.shell_layout(gf) - nx.draw_networkx_nodes(gf,pos, - nodelist=nodelist, - node_size=sizes, - node_color='blue', - alpha=0.5) - nx.draw_networkx_edges(gf,pos, - edgelist = widths.keys(), - width=list(widths.values()), - edge_color='lightblue', - alpha=0.6) - pos_higher = {} - x_off = 0.04 # offset on the x axis - y_off = 0.04 # offset on the y axis - for k, v in pos.items(): - pos_higher[k] = (v[0]+x_off, v[1]+y_off) - if len(selected) == 0: - nx.draw_networkx_labels(gf, pos=pos_higher, - labels=dict(zip(nodelist,nodelist)), - font_color='black') - else: - nx.draw_networkx_labels(gf, pos=pos_higher, - labels = dict(zip(nodelist,[x+' (selected)' if x in selected else x for x in nodelist])), - font_color='black') - plt.box(True) - plt.title("""In SULOV, we repeatedly remove features with lower mutual info scores among highly correlated pairs (see figure), - SULOV selects the feature with higher mutual info score related to target when choosing between a pair. """, fontsize=10) - plt.suptitle('How SULOV Method of Removing Highly Correlated Features in a Data Set works', fontsize=20,y=1.03) - red_patch = mpatches.Patch(color='blue', label='Bigger size of circle denotes higher mutual info score with target') - blue_patch = mpatches.Patch(color='lightblue', label='Thicker line width denotes higher correlation between two variables') - plt.legend(handles=[red_patch, blue_patch],loc='best') - plt.show(); - ##### N E T W O R K D I A G R A M C O M P L E T E ################# - return final_list -############################################################################################### -def count_freq_in_list(lst): - """ - This counts the frequency of items in a list but MAINTAINS the order of appearance of items. - This order is very important when you are doing certain functions. Hence this function! 
- """ - temp=np.unique(lst) - result = [] - for i in temp: - result.append((i,lst.count(i))) - return result -############################################################################################### -def left_subtract(l1,l2): - lst = [] - for i in l1: - if i not in l2: - lst.append(i) - return lst -################################################################################# -def convert_train_test_cat_col_to_numeric(start_train, start_test, col): - """ - #### This is the easiest way to label encode object variables in both train and test - #### This takes care of some categories that are present in train and not in test - ### and vice versa - """ - start_train = copy.deepcopy(start_train) - start_test = copy.deepcopy(start_test) - if start_train[col].isnull().sum() > 0: - start_train[col] = start_train[col].fillna("NA") - train_categs = list(pd.unique(start_train[col].values)) - if not isinstance(start_test,str) : - test_categs = list(pd.unique(start_test[col].values)) - categs_all = train_categs+test_categs - dict_all = return_factorized_dict(categs_all) - else: - dict_all = return_factorized_dict(train_categs) - start_train[col] = start_train[col].map(dict_all) - if not isinstance(start_test,str) : - if start_test[col].isnull().sum() > 0: - start_test[col] = start_test[col].fillna("NA") - start_test[col] = start_test[col].map(dict_all) - return start_train, start_test -############################################################################### -def return_factorized_dict(ls): - """ - ###### Factorize any list of values in a data frame using this neat function - if your data has any NaN's it automatically marks it as -1 and returns that for NaN's - Returns a dictionary mapping previous values with new values. - """ - factos = pd.unique(pd.factorize(ls)[0]) - categs = pd.unique(pd.factorize(ls)[1]) - if -1 in factos: - categs = np.insert(categs,np.where(factos==-1)[0][0],np.nan) - return dict(zip(categs,factos)) -########################################################################################### -############## CONVERSION OF STRING COLUMNS TO NUMERIC WITHOUT LABEL ENCODER ######### -####################################################################################### -import copy -import pdb -def convert_a_column_to_numeric(x, col_dict=""): - '''Function converts any pandas series (or column) consisting of string chars, - into numeric values. It converts an all-string column to an all-number column. - This is an amazing function which performs exactly like a Label Encoding - except that it is simpler and faster''' - if isinstance(col_dict, str): - values = np.unique(x) - values2nums = dict(zip(values,range(len(values)))) - convert_dict = dict(zip(range(len(values)),values)) - return x.replace(values2nums), convert_dict - else: - convert_dict = copy.deepcopy(col_dict) - keys = col_dict.keys() - newkeys = np.unique(x) - rem_keys = left_subtract(newkeys, keys) - max_val = max(col_dict.values()) + 1 - for eachkey in rem_keys: - convert_dict.update({eachkey:max_val}) - max_val += 1 - return x.replace(convert_dict) -####################################################################################### -def convert_a_mixed_object_column_to_numeric(x, col_dict=''): - """ - This is the main utility that converts any string column to numeric. - It does not need Label Encoder since it picks up an string that may not be in test data. 
- """ - x = x.astype(str) - if isinstance(col_dict, str): - x, convert_dict = convert_a_column_to_numeric(x) - convert_dict = dict([(v,k) for (k,v) in convert_dict.items()]) - return x, convert_dict - else: - x = convert_a_column_to_numeric(x, col_dict) - return x -###################################################################################### -def convert_all_object_columns_to_numeric(train, test=""): - """ - ####################################################################################### - This is a utility that converts string columns to numeric WITHOUT LABEL ENCODER. - The beauty of this utility is that it does not blow up when it finds strings in test not in train. - ####################################################################################### - """ - train = copy.deepcopy(train) - if object in train.dtypes.values: - lis=[] - for row,column in train.dtypes.iteritems(): - if column == object: - lis.append(row) - #print('%d string variables identified' %len(lis)) - for everycol in lis: - #print(' Converting %s to numeric' %everycol) - try: - train[everycol], train_dict = convert_a_mixed_object_column_to_numeric(train[everycol]) - if not isinstance(test, str): - test[everycol],_ = convert_a_mixed_object_column_to_numeric(test[everycol], train_dict) - except: - print('Error converting %s column from string to numeric. Continuing...' %everycol) - continue - return train, test -################################################################################### -from sklearn.feature_selection import chi2, mutual_info_regression, mutual_info_classif -from sklearn.feature_selection import SelectKBest -def featurewiz(dataname, target, corr_limit=0.7, verbose=0, sep=",", header=0): - """ - This is a fast utility that uses XGB to find top features. You - It returns a list of important features. - Since it is XGB, you dont have to restrict the input to just numeric vars. - You can send in all kinds of vars and it will take care of transforming it. Sweet! - """ - train = load_file_dataframe(dataname, sep, header, verbose) - start_time = time.time() - #### If there are more than 30 categorical variables in a data set, it is worth reducing features. - #### Otherwise. XGBoost is pretty good at finding the best features whether cat or numeric ! 
- n_splits = 5 - max_depth = 8 - max_cats = 5 - ###################### I M P O R T A N T #################################### - subsample = 0.7 - col_sub_sample = 0.7 - test_size = 0.2 - seed = 1 - early_stopping = 5 - ####### All the default parameters are set up now ######### - kf = KFold(n_splits=n_splits, random_state=33) - ###### This is where we set the CPU and GPU parameters for XGBoost - GPU_exists = check_if_GPU_exists() - ##### Set the Scoring Parameters here based on each model and preferences of user ############## - cpu_params = {} - param = {} - cpu_params['nthread'] = -1 - cpu_params['tree_method'] = 'hist' - cpu_params['grow_policy'] = 'depthwise' - cpu_params['max_depth'] = max_depth - cpu_params['max_leaves'] = 0 - cpu_params['verbosity'] = 0 - cpu_params['gpu_id'] = 0 - cpu_params['updater'] = 'grow_colmaker' - cpu_params['predictor'] = 'cpu_predictor' - cpu_params['num_parallel_tree'] = 1 - if GPU_exists: - param['nthread'] = -1 - param['tree_method'] = 'gpu_hist' - param['grow_policy'] = 'depthwise' - param['max_depth'] = max_depth - param['max_leaves'] = 0 - param['verbosity'] = 0 - param['gpu_id'] = 0 - param['updater'] = 'grow_gpu_hist' #'prune' - param['predictor'] = 'gpu_predictor' - param['num_parallel_tree'] = 1 - print(' Running XGBoost using GPU parameters') - else: - param = copy.deepcopy(cpu_params) - print(' Running XGBoost using CPU parameters') - ############################################################################### - if isinstance(target, str): - target = [target] - multi_label = False - else: - if len(target) <= 1: - multi_label = False - else: - multi_label = True - ###### Now we detect the various types of variables to see how to convert them to numeric - features_dict = classify_features(train, target) - cols_to_remove = features_dict['cols_delete'] + features_dict['IDcols'] + features_dict['discrete_string_vars']+features_dict['date_vars'] - preds = [x for x in list(train) if x not in target+cols_to_remove] - numvars = train[preds].select_dtypes(include = 'number').columns.tolist() - catvars = left_subtract(preds, numvars) - rem_vars = copy.deepcopy(catvars) - ########## Now we need to select the right model to run repeatedly #### - if target is None or len(target) == 0: - cols_list = list(train) - modeltype = 'Clustering' - else: - modeltype = analyze_problem_type(train, target) - cols_list = left_subtract(list(train),target) - ###################### I M P O R T A N T ############################################## - ###### This top_num decides how many top_n features XGB selects in each iteration. - #### There a total of 5 iterations. Hence 5x10 means maximum 50 features will be selected. - ##### If there are more than 50 variables, then maximum 25% of its variables will be selected - if len(preds) <= 50: - top_num = 10 - else: - ### the maximum number of variables will 25% of preds which means we divide by 5 and get 5% here - ### The five iterations result in 10% being chosen in each iteration. Hence max 50% of variables! 
- top_num = int(len(preds)*0.10) - ###################### I M P O R T A N T ############################################## - important_cats = copy.deepcopy(catvars) - ######## Drop Missing value rows since XGB for some reason ######### - ######## can't handle missing values in early stopping rounds ####### - train.dropna(axis=0,subset=preds+target,inplace=True) - if len(numvars) > 1: - final_list = remove_variables_using_fast_correlation(train,numvars,modeltype,target, - corr_limit,verbose) - else: - final_list = copy.deepcopy(numvars) - ####### This is where you draw how featurewiz works when the verbose = 2 ########### - print(' Adding %s categorical variables to reduced numeric variables of %d' %( - len(important_cats),len(final_list))) - if isinstance(final_list,np.ndarray): - final_list = final_list.tolist() - preds = final_list+important_cats - #######You must convert category variables into integers ############### - if len(important_cats) > 0: - train, _ = convert_all_object_columns_to_numeric(train, "") - ######## Dont move this train and y definition anywhere else ######## - y = train[target] - print('############## F E A T U R E S E L E C T I O N ####################') - important_features = [] - ########## This is for Single_Label problems ###################### - if modeltype == 'Regression': - objective = 'reg:squarederror' - model_xgb = XGBRegressor( n_estimators=100,subsample=subsample,objective=objective, - colsample_bytree=col_sub_sample,reg_alpha=0.5, reg_lambda=0.5, - seed=1,n_jobs=-1,random_state=1) - eval_metric = 'rmse' - else: - #### This is for Classifiers only - classes = np.unique(train[target].values) - if len(classes) == 2: - model_xgb = XGBClassifier(base_score=0.5, booster='gbtree', subsample=subsample, - colsample_bytree=col_sub_sample,gamma=1, learning_rate=0.1, max_delta_step=0, - max_depth=max_depth, min_child_weight=1, missing=-999, n_estimators=100, - n_jobs=-1, nthread=None, objective='binary:logistic', - random_state=1, reg_alpha=0.5, reg_lambda=0.5, - seed=1) - eval_metric = 'logloss' - else: - model_xgb = XGBClassifier(base_score=0.5, booster='gbtree', subsample=subsample, - colsample_bytree=col_sub_sample, gamma=1, learning_rate=0.1, max_delta_step=0, - max_depth=max_depth, min_child_weight=1, missing=-999, n_estimators=100, - n_jobs=-1, nthread=None, objective='multi:softmax', - random_state=1, reg_alpha=0.5, reg_lambda=0.5, - seed=1) - eval_metric = 'mlogloss' - #### Now set the parameters for XGBoost ################### - model_xgb.set_params(**param) - #print('Model parameters: %s' %model_xgb) - if multi_label: - ########## This is for multi_label problems ############################### - if modeltype == 'Regression': - model_xgb = MultiOutputRegressor(model_xgb) - #model_xgb = RegressorChain(model_xgb) - else: - ## just do randomized search CV - no need to do one vs rest unless multi-class - model_xgb = MultiOutputClassifier(model_xgb) - #model_xgb = ClassifierChain(model_xgb) - #### This is where you start to Iterate on Finding Important Features ################ - save_xgb = copy.deepcopy(model_xgb) - train_p = train[preds] - if train_p.shape[1] < 10: - iter_limit = 2 - else: - iter_limit = int(train_p.shape[1]/5+0.5) - print('Current number of predictors = %d ' %(train_p.shape[1],)) - print(' Finding Important Features using Boosted Trees algorithm...') - ######## This is where we start training the XGBoost model to find top features #### - try: - for i in range(0,train_p.shape[1],iter_limit): - new_xgb = copy.deepcopy(save_xgb) - print(' 
using %d variables...' %(train_p.shape[1]-i)) - imp_feats = [] - if train_p.shape[1]-i < iter_limit: - X = train_p.iloc[:,i:] - cols_sel = X.columns.tolist() - if modeltype == 'Regression': - train_part = int((1-test_size)*X.shape[0]) - X_train, X_cv, y_train, y_cv = X[:train_part],X[train_part:],y[:train_part],y[train_part:] - else: - X_train, X_cv, y_train, y_cv = train_test_split(X, y, - test_size=test_size, random_state=seed) - try: - if multi_label: - eval_set = [(X_train.values,y_train.values),(X_cv.values,y_cv.values)] - else: - eval_set = [(X_train,y_train),(X_cv,y_cv)] - if multi_label: - model_xgb.fit(X_train,y_train) - else: - model_xgb.fit(X_train,y_train,early_stopping_rounds=early_stopping,eval_set=eval_set, - eval_metric=eval_metric,verbose=False) - except: - #### On Colab, even though GPU exists, many people don't turn it on. - #### In that case, XGBoost blows up when gpu_predictor is used. - #### This is to turn it back to cpu_predictor in case GPU errors! - if GPU_exists: - print('Error: GPU exists but it is not turned on. Using CPU for predictions...') - if multi_label: - new_xgb.estimator.set_params(**cpu_params) - new_xgb.fit(X_train,y_train) - else: - new_xgb.set_params(**cpu_params) - new_xgb.fit(X_train,y_train,early_stopping_rounds=early_stopping,eval_set=eval_set, - eval_metric=eval_metric,verbose=False) - #### This is where you collect the feature importances from each run ############ - if multi_label: - ### doing this for multi-label is a little different for single label ######### - imp_feats = [model_xgb.estimators_[i].feature_importances_ for i in range(len(target))] - imp_feats_df = pd.DataFrame(imp_feats).T - imp_feats_df.columns = target - imp_feats_df.index = cols_sel - imp_feats_df['sum'] = imp_feats_df.sum(axis=1).values - important_features += imp_feats_df.sort_values(by='sum',ascending=False)[:top_num].index.tolist() - else: - ### doing this for single-label is a little different from multi_label ######### - important_features += pd.Series(model_xgb.get_booster().get_score( - importance_type='gain')).sort_values(ascending=False)[:top_num].index.tolist() - ####### order this in the same order in which they were collected ###### - important_features = list(OrderedDict.fromkeys(important_features)) - else: - X = train_p[list(train_p.columns.values)[i:train_p.shape[1]]] - cols_sel = X.columns.tolist() - #### Split here into train and test ##### - if modeltype == 'Regression': - train_part = int((1-test_size)*X.shape[0]) - X_train, X_cv, y_train, y_cv = X[:train_part],X[train_part:],y[:train_part],y[train_part:] - else: - X_train, X_cv, y_train, y_cv = train_test_split(X, y, - test_size=test_size, random_state=seed) - ### set the validation data as arrays in multi-label case ##### - if multi_label: - eval_set = [(X_train.values,y_train.values),(X_cv.values,y_cv.values)] - else: - eval_set = [(X_train,y_train),(X_cv,y_cv)] - ########## Try training the model now ##################### - try: - if multi_label: - model_xgb.fit(X_train,y_train) - else: - model_xgb.fit(X_train,y_train,early_stopping_rounds=early_stopping, - eval_set=eval_set,eval_metric=eval_metric,verbose=False) - except: - #### On Colab, even though GPU exists, many people don't turn it on. - #### In that case, XGBoost blows up when gpu_predictor is used. - #### This is to turn it back to cpu_predictor in case GPU errors! - if GPU_exists: - print('Error: GPU exists but it is not turned on. 
Using CPU for predictions...') - if multi_label: - new_xgb.estimator.set_params(**cpu_params) - new_xgb.fit(X_train,y_train) - else: - new_xgb.set_params(**cpu_params) - new_xgb.fit(X_train,y_train,early_stopping_rounds=early_stopping, - eval_set=eval_set,eval_metric=eval_metric,verbose=False) - ### doing this for multi-label is a little different for single label ######### - if multi_label: - imp_feats = [model_xgb.estimators_[i].feature_importances_ for i in range(len(target))] - imp_feats_df = pd.DataFrame(imp_feats).T - imp_feats_df.columns = target - imp_feats_df.index = cols_sel - imp_feats_df['sum'] = imp_feats_df.sum(axis=1).values - important_features += imp_feats_df.sort_values(by='sum',ascending=False)[:top_num].index.tolist() - else: - important_features += pd.Series(model_xgb.get_booster().get_score( - importance_type='gain')).sort_values(ascending=False)[:top_num].index.tolist() - important_features = list(OrderedDict.fromkeys(important_features)) - except: - print('Finding top features using XGB is crashing. Continuing with all predictors...') - important_features = copy.deepcopy(preds) - return important_features - important_features = list(OrderedDict.fromkeys(important_features)) - print('Found %d important features' %len(important_features)) - print(' Time taken (in seconds) = %0.0f' %(time.time()-start_time)) - numvars = [x for x in numvars if x in important_features] - important_cats = [x for x in important_cats if x in important_features] - return important_features -################################################################################ -def remove_highly_correlated_vars_fast(df, corr_limit=0.70): - """ - This is a simple method to remove highly correlated features fast using Pearson's Correlation. - Use this only for float and integer variables. It will automatically select those only. 
- It can be used for very large data sets where featurewiz has trouble with memory - """ - # Creating correlation matrix - correlation_dataframe = df.corr().abs().astype(np.float16) - # Selecting upper triangle of correlation matrix - upper_tri = correlation_dataframe.where(np.triu(np.ones(correlation_dataframe.shape), - k=1).astype(np.bool)) - # Finding index of feature columns with correlation greater than 0.95 - to_drop = [column for column in upper_tri.columns if any(upper_tri[column] > corr_limit)] - print(); - print('Highly correlated columns to remove: %s' %to_drop) - return to_drop -##################################################################################### -import os -def check_if_GPU_exists(): - GPU_exists = False - try: - from tensorflow.python.client import device_lib - dev_list = device_lib.list_local_devices() - print('Number of GPUs = %d' %len(dev_list)) - for i in range(len(dev_list)): - if 'GPU' == dev_list[i].device_type: - GPU_exists = True - print('%s available' %dev_list[i].device_type) - except: - print('') - if not GPU_exists: - try: - os.environ['NVIDIA_VISIBLE_DEVICES'] - print(' GPU active on this device') - return True - except: - print(' No GPU active on this device') - return False - else: - return True -############################################################################################# -from itertools import combinations -import matplotlib.patches as mpatches -import matplotlib.pyplot as plt -from sklearn.feature_selection import chi2, mutual_info_regression, mutual_info_classif -from sklearn.feature_selection import SelectKBest -###################################################################################### -# Removes duplicates from a list to return unique values - USED ONLYONCE -def find_remove_duplicates(values): - output = [] - seen = set() - for value in values: - if value not in seen: - output.append(value) - seen.add(value) - return output -################################################################################ diff --git a/dist/featurewiz-0.0.6-py3-none-any.whl b/dist/featurewiz-0.0.6-py3-none-any.whl deleted file mode 100644 index ca50d26..0000000 Binary files a/dist/featurewiz-0.0.6-py3-none-any.whl and /dev/null differ diff --git a/dist/featurewiz-0.0.6.tar.gz b/dist/featurewiz-0.0.6.tar.gz deleted file mode 100644 index 3c595ef..0000000 Binary files a/dist/featurewiz-0.0.6.tar.gz and /dev/null differ diff --git a/featurewiz.egg-info/PKG-INFO b/featurewiz.egg-info/PKG-INFO index 4b3d821..39cdef3 100644 --- a/featurewiz.egg-info/PKG-INFO +++ b/featurewiz.egg-info/PKG-INFO @@ -1,171 +1,171 @@ -Metadata-Version: 2.1 -Name: featurewiz -Version: 0.0.6 -Summary: Select Best Features from your data set - any size - now with XGBoost! -Home-page: https://github.com/AutoViML/featurewiz -Author: Ram Seshadri -Author-email: rsesha2001@yahoo.com -License: Apache License 2.0 -Description: # featurewiz - - ![banner](featurewiz_logo.jpg) - - Featurewiz is a new python library for selecting the best features in your data set fast! - (featurewiz logo created using Wix) -

Two methods are used in this version of featurewiz:
- - 1. SULOV -> SULOV means Searching for Uncorrelated List of Variables. The SULOV method is explained in this chart below. THIS METHOD IS KNOWN AS SULOV METHOD in memory of my mom, SULOCHANA SESHADRI. Additionally, SULOV can also mean: “Searching for Uncorrelated List Of Variables” - - Here is a simple way of explaining how it works: -

    -
  1. Find all the pairs of highly correlated variables exceeding a correlation threshold (say absolute(0.7)). -
  2. Then find their MIS score (Mutual Information Score) to the target variable. MIS is a non-parametric scoring method. So its suitable for all kinds of variables and target. -
  3. Now take each pair of correlated variables, then knock off the one with the lower MIS score. -
  4. What’s left is the ones with the highest Information scores and least correlation with each other. -
- - - ![sulov](SULOV.jpg) - - 2. Recursive XGBoost: Once SULOV has selected variables that have high mutual information scores with least less correlation amongst them, we use XGBoost to repeatedly find best features among the remaining variables after SULOV. The Recursive XGBoost method is explained in this chart below. - Once have done SULOV method, now select the best variables using XGBoost feature important but apply it recursively to smaller and smaller sets of data in your data set. This is how it works: -
    -
  1. Select all variables in data set and the full data split into train and valid sets. -
  2. Find top X features (could be 10) on train using valid for early stopping (to prevent over-fitting) -
  3. Then take next set of vars and find top X -
  4. Do this 5 times. Combine all selected features and de-duplicate them. -
- - - ![xgboost](xgboost.jpg) - - 3. Most variables are included: It automatically detects types of variables in your data set and converts them to numeric except date-time, NLP and large-text variables.
- - 4. Feature Engineering: You can add as many variables as you want and as the last step before modeling, you can perform feature selection with featurewiz -

To upgrade to the best, most stable and full-featured version always do the following:
- Use $ pip install featurewiz --upgrade --ignore-installed
- or - pip install git+https://github.com/AutoViML/featurewiz.git
- - ## Table of Contents -

- - ## Background - - Watch this video [video](https://www.youtube.com/embed/ZiNutwPcAU0)
- -

featurewiz was designed for selecting High Performance variables with the fewest steps. - - In most cases, featurewiz builds models with 20%-99% fewer features than your original data set with nearly the same or slightly lower performance (this is based on my trials. Your experience may vary).
-

- featurewiz is every Data Scientist's feature wizard that will:

    -
  1. Automatically pre-process data: you can send in your entire dataframe as is and featurewiz will classify and change/label encode categorical variables changes to help XGBoost processing. That way, you don't have to preprocess your data before using featurewiz
    -
  2. Assist you with variable classification: featurewiz classifies variables automatically. This is very helpful when you have hundreds if not thousands of variables since it can readily identify which of those are numeric vs categorical vs NLP text vs date-time variables and so on.
    -
  3. Perform feature reduction automatically. When you have small data sets and you know your domain well, it is easy to perhaps do EDA and identify which variables are important. But when you have a very large data set with hundreds if not thousands of variables, selecting the best features from your model can mean the difference between a bloated and highly complex model or a simple model with the fewest and most information-rich features. featurewiz uses XGBoost repeatedly to perform feature selection. You must try it on your large data sets and compare!
    -
  4. Explain SULOV method graphically using networkx library so you can see which variables are highly correlated to which ones and which of those have high or low mutual information scores automatically. Just set verbose = 2 to see the graph.
    -
- featurewiz is built using xgboost, numpy, pandas and matplotlib. It should run on most Python 3 Anaconda installations. You won't have to import any special - libraries other than "XGBoost" and "networkx" library. We use "networkx" library for interpretability.
But if you don't have these libraries, featurewiz will install those for you automatically. - - ## Install - - **Prerequsites:** - - - [Anaconda](https://docs.anaconda.com/anaconda/install/) - - To clone featurewiz, it is better to create a new environment, and install the required dependencies: - - To install from PyPi: - - ``` - conda create -n python=3.7 anaconda - conda activate # ON WINDOWS: `source activate ` - pip install autoviml - or - pip install git+https://github.com/AutoViML/featurewiz.git - ``` - - To install from source: - - ``` - cd - git clone git@github.com:AutoViML/featurewiz.git - # or download and unzip https://github.com/AutoViML/featurewiz/archive/master.zip - conda create -n python=3.7 anaconda - conda activate # ON WINDOWS: `source activate ` - cd featurewiz - pip install -r requirements.txt - ``` - - ## Usage - - In the same directory, open a Jupyter Notebook and use this line to import the .py file: - - ``` - from featurewiz import featurewiz - ``` - - Load a data set (any CSV or text file) into a Pandas dataframe and give it the name of the target(s) variable. If you have more than one target, it will handle multi-label targets too. Just give it a list of variables in that case. If you don't have a dataframe, you can simply enter the name and path of the file to load into featurewiz: - - ``` - features = featurewiz( - dataname, - target, - corr_limit=0.7, - verbose=2, - sep=",", - header=0) - ``` - - Finally, it returns the list of variables selected. - - This list is ready for you to now to do further modeling. - - featurewiz works on any Multi-Class, Multi-Label Data Set. So you can have as many target labels as you want. - You don't have to tell featurwiz whether it is a Regression or Classification problem. It will decide that automatically. - - ## API - - **Arguments** - - - `dataname`: could be a datapath+filename or a dataframe. It will detect whether your input is a filename or a dataframe and load it automatically. - - `target`: name of the target variable in the data set. - - `corr_limit`: if you want to set your own threshold for removing variables as highly correlated, then give it here. The default is 0.7 which means variables less than -0.7 and greater than 0.7 in pearson's correlation will be candidates for removal. - - `verbose`: This has 3 possible states: - - `0` limited output. Great for running this silently and getting fast results. - - `1` more verbiage. Great for knowing how results were and making changes to flags in input. - - `2` SULOV charts and output. Great for finding out what happens under the hood for SULOV method. - - **Return values** - - - `features`: the fewest number of features in your model to make it perform well - - ## Maintainers - - * [@AutoViML](https://github.com/AutoViML) - - ## Contributing - - See [the contributing file](CONTRIBUTING.md)! - - PRs accepted. - - ## License - - Apache License 2.0 © 2020 Ram Seshadri - - ## DISCLAIMER - This project is not an official Google project. It is not supported by Google and Google specifically disclaims all warranties as to its quality, merchantability, or fitness for a particular purpose. - -Platform: UNKNOWN -Classifier: Programming Language :: Python :: 3 -Classifier: Operating System :: OS Independent -Description-Content-Type: text/markdown +Metadata-Version: 2.1 +Name: featurewiz +Version: 0.0.6 +Summary: Select Best Features from your data set - any size - now with XGBoost! 
+Home-page: https://github.com/AutoViML/featurewiz +Author: Ram Seshadri +Author-email: rsesha2001@yahoo.com +License: Apache License 2.0 +Description: # featurewiz + + ![banner](featurewiz_logo.jpg) + + Featurewiz is a new python library for selecting the best features in your data set fast! + (featurewiz logo created using Wix) +

Two methods are used in this version of featurewiz:
+ + 1. SULOV: SULOV stands for Searching for Uncorrelated List Of Variables, and the method is named in memory of my mom, SULOCHANA SESHADRI. The SULOV method is explained in the chart below. + + Here is a simple way of explaining how it works (a short code sketch follows these steps): +

    +
  1. Find all the pairs of highly correlated variables exceeding a correlation threshold (say absolute(0.7)). +
  2. Then find their MIS score (Mutual Information Score) to the target variable. MIS is a non-parametric scoring method, so it's suitable for all kinds of variables and targets. +
  3. Now take each pair of correlated variables, then knock off the one with the lower MIS score. +
  4. What’s left are the variables with the highest information scores and the least correlation with each other. +
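The short sketch below illustrates these four steps on a purely numeric dataframe. It is a simplified illustration of the idea, not the exact featurewiz implementation (which lives in `remove_variables_using_fast_correlation`); the function name `sulov_sketch` and the use of `mutual_info_regression` (i.e. a regression-style target) are assumptions made for the example.

```python
# Simplified SULOV sketch -- assumes a numeric pandas DataFrame `df` and a numeric target column.
import pandas as pd
from sklearn.feature_selection import mutual_info_regression

def sulov_sketch(df: pd.DataFrame, target: str, corr_limit: float = 0.7) -> list:
    feats = [c for c in df.columns if c != target]
    corr = df[feats].corr().abs()                       # step 1: absolute pair-wise correlations
    mis = pd.Series(mutual_info_regression(df[feats], df[target]),
                    index=feats)                        # step 2: MIS of each feature vs. the target
    selected, dropped = [], set()
    for col in mis.sort_values(ascending=False).index:  # step 3: walk features from highest MIS down
        if col in dropped:
            continue
        selected.append(col)
        # knock out everything that is highly correlated with the feature we just kept
        partners = corr[col][(corr[col] > corr_limit) & (corr[col].index != col)].index
        dropped.update(partners)
    return selected                                     # step 4: high-MIS, mutually uncorrelated features
```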
+ + + ![sulov](SULOV.jpg) + + 2. Recursive XGBoost: Once SULOV has selected variables that have high mutual information scores and the least correlation amongst them, featurewiz uses XGBoost to repeatedly find the best features among those remaining variables. The Recursive XGBoost method is explained in this chart below. + Once the SULOV method is done, the best variables are selected using XGBoost feature importance, applied recursively to smaller and smaller subsets of the data. This is how it works (a short code sketch follows these steps): +
    +
  1. Select all variables in data set and the full data split into train and valid sets. +
  2. Find top X features (could be 10) on train using valid for early stopping (to prevent over-fitting) +
  3. Then take next set of vars and find top X +
  4. Do this 5 times. Combine all selected features and de-duplicate them. +
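Here is a compact sketch of that recursive loop, again as an illustration rather than the exact featurewiz code: it assumes a numeric feature DataFrame `X`, a target `y`, an installed xgboost, and a regression problem; the names `recursive_xgboost_sketch`, `top_n` and `passes` are invented for the example. featurewiz additionally uses the validation split for early stopping, which is omitted here for brevity.

```python
# Simplified sketch of the recursive XGBoost feature-selection loop described above.
import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor

def recursive_xgboost_sketch(X: pd.DataFrame, y, top_n: int = 10, passes: int = 5) -> list:
    cols = list(X.columns)
    step = max(1, len(cols) // passes)          # roughly `passes` rounds over the columns
    important = []
    for start in range(0, len(cols), step):
        subset = cols[start:]                   # each round looks at the remaining block of columns
        X_tr, X_va, y_tr, y_va = train_test_split(X[subset], y, test_size=0.2, random_state=1)
        model = XGBRegressor(n_estimators=100, max_depth=8, random_state=1)
        model.fit(X_tr, y_tr, eval_set=[(X_va, y_va)], verbose=False)
        scores = pd.Series(model.feature_importances_, index=subset)
        important += scores.sort_values(ascending=False).head(top_n).index.tolist()
    return list(dict.fromkeys(important))       # combine every round's picks and de-duplicate in order
```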
+ + + ![xgboost](xgboost.jpg) + + 3. Most variables are included: It automatically detects types of variables in your data set and converts them to numeric except date-time, NLP and large-text variables.
+ + 4. Feature Engineering: You can engineer and add as many variables as you want; then, as the last step before modeling, you can perform feature selection with featurewiz (see the example below). +
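For instance, you might add a few engineered columns with pandas and then hand the enlarged dataframe straight to featurewiz. The dataset, column names and target below are made up; the call signature matches the Usage section further down.

```python
# Hypothetical example: engineer extra features, then let featurewiz select the best ones.
import numpy as np
import pandas as pd
from featurewiz import featurewiz

df = pd.read_csv("sales.csv")                    # assumed file with 'price', 'qty' and a 'revenue' target
df["price_per_qty"] = df["price"] / df["qty"]    # engineered feature 1
df["log_price"] = np.log1p(df["price"])          # engineered feature 2

features = featurewiz(df, "revenue", corr_limit=0.7, verbose=1)
print(features)                                  # the reduced list of selected feature names
```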

To upgrade to the best, most stable and full-featured version, always do the following:
+ Use $ pip install featurewiz --upgrade --ignore-installed
+ or + pip install git+https://github.com/AutoViML/featurewiz.git
+ + ## Table of Contents +

+ + ## Background + + Watch this [video](https://www.youtube.com/embed/ZiNutwPcAU0)
+ +

featurewiz was designed for selecting high-performance variables with the fewest steps. + + In most cases, models built with the features featurewiz selects use 20%-99% fewer features than your original data set, with nearly the same or only slightly lower performance (this is based on my trials; your experience may vary).
+

+ featurewiz is every Data Scientist's feature wizard that will:

    +
  1. Automatically pre-process data: you can send in your entire dataframe as is, and featurewiz will classify and label-encode the categorical variables to help XGBoost process them (see the sketch after this list). That way, you don't have to preprocess your data before using featurewiz
    +
  2. Assist you with variable classification: featurewiz classifies variables automatically. This is very helpful when you have hundreds if not thousands of variables since it can readily identify which of those are numeric vs categorical vs NLP text vs date-time variables and so on.
    +
  3. Perform feature reduction automatically. When you have small data sets and you know your domain well, it is easy to perhaps do EDA and identify which variables are important. But when you have a very large data set with hundreds if not thousands of variables, selecting the best features from your model can mean the difference between a bloated and highly complex model or a simple model with the fewest and most information-rich features. featurewiz uses XGBoost repeatedly to perform feature selection. You must try it on your large data sets and compare!
    +
  4. Explain SULOV method graphically using networkx library so you can see which variables are highly correlated to which ones and which of those have high or low mutual information scores automatically. Just set verbose = 2 to see the graph.
    +
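As noted in point 1 of the list above, this label encoding happens inside featurewiz automatically, but the same helper is also exported by the package if you want to apply it yourself. The toy dataframes below are made up; `convert_all_object_columns_to_numeric` is the function exported in the package's `__init__.py`.

```python
# Sketch of the automatic label encoding featurewiz applies to object (string) columns.
import pandas as pd
from featurewiz import convert_all_object_columns_to_numeric

train = pd.DataFrame({"city": ["NY", "SF", "NY"], "sales": [10, 20, 15]})
test = pd.DataFrame({"city": ["SF", "LA"], "sales": [12, 18]})   # 'LA' never appears in train

train_num, test_num = convert_all_object_columns_to_numeric(train, test)
print(train_num.dtypes)   # 'city' is now numeric; unseen test categories receive their own codes
```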
+ featurewiz is built using xgboost, numpy, pandas and matplotlib. It should run on most Python 3 Anaconda installations. You won't have to install any special + libraries other than "XGBoost" and "networkx"; we use "networkx" for interpretability.
But if you don't have these libraries, featurewiz will install those for you automatically. + + ## Install + + **Prerequsites:** + + - [Anaconda](https://docs.anaconda.com/anaconda/install/) + + To clone featurewiz, it is better to create a new environment, and install the required dependencies: + + To install from PyPi: + + ``` + conda create -n python=3.7 anaconda + conda activate # ON WINDOWS: `source activate ` + pip install autoviml + or + pip install git+https://github.com/AutoViML/featurewiz.git + ``` + + To install from source: + + ``` + cd + git clone git@github.com:AutoViML/featurewiz.git + # or download and unzip https://github.com/AutoViML/featurewiz/archive/master.zip + conda create -n python=3.7 anaconda + conda activate # ON WINDOWS: `source activate ` + cd featurewiz + pip install -r requirements.txt + ``` + + ## Usage + + In the same directory, open a Jupyter Notebook and use this line to import the .py file: + + ``` + from featurewiz import featurewiz + ``` + + Load a data set (any CSV or text file) into a Pandas dataframe and give it the name of the target(s) variable. If you have more than one target, it will handle multi-label targets too. Just give it a list of variables in that case. If you don't have a dataframe, you can simply enter the name and path of the file to load into featurewiz: + + ``` + features = featurewiz( + dataname, + target, + corr_limit=0.7, + verbose=2, + sep=",", + header=0) + ``` + + Finally, it returns the list of variables selected. + + This list is ready for you to now to do further modeling. + + featurewiz works on any Multi-Class, Multi-Label Data Set. So you can have as many target labels as you want. + You don't have to tell featurwiz whether it is a Regression or Classification problem. It will decide that automatically. + + ## API + + **Arguments** + + - `dataname`: could be a datapath+filename or a dataframe. It will detect whether your input is a filename or a dataframe and load it automatically. + - `target`: name of the target variable in the data set. + - `corr_limit`: if you want to set your own threshold for removing variables as highly correlated, then give it here. The default is 0.7 which means variables less than -0.7 and greater than 0.7 in pearson's correlation will be candidates for removal. + - `verbose`: This has 3 possible states: + - `0` limited output. Great for running this silently and getting fast results. + - `1` more verbiage. Great for knowing how results were and making changes to flags in input. + - `2` SULOV charts and output. Great for finding out what happens under the hood for SULOV method. + + **Return values** + + - `features`: the fewest number of features in your model to make it perform well + + ## Maintainers + + * [@AutoViML](https://github.com/AutoViML) + + ## Contributing + + See [the contributing file](CONTRIBUTING.md)! + + PRs accepted. + + ## License + + Apache License 2.0 © 2020 Ram Seshadri + + ## DISCLAIMER + This project is not an official Google project. It is not supported by Google and Google specifically disclaims all warranties as to its quality, merchantability, or fitness for a particular purpose. 
+ +Platform: UNKNOWN +Classifier: Programming Language :: Python :: 3 +Classifier: Operating System :: OS Independent +Description-Content-Type: text/markdown diff --git a/featurewiz/__init__.py b/featurewiz/__init__.py index 82e22b9..eb32c03 100644 --- a/featurewiz/__init__.py +++ b/featurewiz/__init__.py @@ -1,23 +1,23 @@ -# -*- coding: utf-8 -*- -################################################################################ -# featurewiz - fast feature selection using one line of code -# Python v3.6+ -# Created by Ram Seshadri -# Licensed under Apache License v2 -################################################################################ -# Version -from .__version__ import __version__ -from .featurewiz import featurewiz, convert_all_object_columns_to_numeric -from .featurewiz import split_one_field_into_many, add_aggregate_primitive_features -from .featurewiz import create_time_series_features -if __name__ == "__main__": - version_number = __version__ - print("""Running featurewiz version: %s. Call by using: - features = featurewiz(dataname, target, corr_limit=0.70, - verbose=2, sep=',', header=0)""" %version_number) -else: - version_number = __version__ - print("""Imported featurewiz version: %s. Call by using: - features = featurewiz(dataname, target, corr_limit=0.70, - verbose=2, sep=',', header=0)""" %version_number) -################################################################################ +# -*- coding: utf-8 -*- +################################################################################ +# featurewiz - fast feature selection using one line of code +# Python v3.6+ +# Created by Ram Seshadri +# Licensed under Apache License v2 +################################################################################ +# Version +from .__version__ import __version__ +from .featurewiz import featurewiz, convert_all_object_columns_to_numeric +from .featurewiz import split_one_field_into_many, add_aggregate_primitive_features +from .featurewiz import create_time_series_features +if __name__ == "__main__": + version_number = __version__ + print("""Running featurewiz version: %s. Call by using: + features = featurewiz(dataname, target, corr_limit=0.70, + verbose=2, sep=',', header=0)""" %version_number) +else: + version_number = __version__ + print("""Imported featurewiz version: %s. 
Call by using: + features = featurewiz(dataname, target, corr_limit=0.70, + verbose=2, sep=',', header=0)""" %version_number) +################################################################################ diff --git a/featurewiz/__version__.py b/featurewiz/__version__.py index 6db5610..4790b55 100644 --- a/featurewiz/__version__.py +++ b/featurewiz/__version__.py @@ -1,10 +1,10 @@ -# -*- coding: utf-8 -*- -"""Specifies the version of the FeatureWiz package.""" - -__title__ = "featurewiz" -__author__ = "Ram Seshadri" -__description__ = "Fast Feature Selection for any data set, any size" -__url__ = "https://github.com/Auto_ViML/featurewiz.git" -__version__ = "0.0.7" -__license__ = "Apache License 2.0" -__copyright__ = "2020 Google" +# -*- coding: utf-8 -*- +"""Specifies the version of the FeatureWiz package.""" + +__title__ = "featurewiz" +__author__ = "Ram Seshadri" +__description__ = "Fast Feature Selection for any data set, any size" +__url__ = "https://github.com/Auto_ViML/featurewiz.git" +__version__ = "0.0.7" +__license__ = "Apache License 2.0" +__copyright__ = "2020 Google" diff --git a/featurewiz/featurewiz.py b/featurewiz/featurewiz.py index 2997aed..92d7202 100644 --- a/featurewiz/featurewiz.py +++ b/featurewiz/featurewiz.py @@ -1,1330 +1,1330 @@ -############################################################################## -#Copyright 2019 Google LLC -# -#Licensed under the Apache License, Version 2.0 (the "License"); -#you may not use this file except in compliance with the License. -#You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -#Unless required by applicable law or agreed to in writing, software -#distributed under the License is distributed on an "AS IS" BASIS, -#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -#See the License for the specific language governing permissions and -#limitations under the License. -################################################################################# -#### C O D E C A N B E RE-FACTORED W I T H C I T A T I O N B E L O W ### -################################################################################# -############### F E A T U R E W I Z A R D ################## -################ featurewiz library developed by Ram Seshadri ################# -# featurewiz utilizes SULOV METHOD which is a fast method for feature selection # -##### SULOV also means Searching for Uncorrelated List Of Variables (:-) ###### -############### v 0.0.7 ################ -############### A L L R I G H T S R E S E R V E D ################ -################################################################################# -##### This project is not an official Google project. It is not supported by #### -##### Google and Google specifically disclaims all warranties as to its quality,# -##### merchantability, or fitness for a particular purpose. 
#################### -################################################################################# -import pandas as pd -import numpy as np -from sklearn.model_selection import KFold -from sklearn.model_selection import GridSearchCV -from sklearn.multioutput import MultiOutputClassifier, MultiOutputRegressor -from sklearn.multiclass import OneVsRestClassifier -import xgboost as xgb -from xgboost.sklearn import XGBClassifier -from xgboost.sklearn import XGBRegressor -from sklearn.model_selection import train_test_split -################################################################################ -#### The warnings from Sklearn are so annoying that I have to shut it off ####### -import warnings -warnings.filterwarnings("ignore") -from sklearn.exceptions import DataConversionWarning -warnings.filterwarnings(action='ignore', category=DataConversionWarning) -def warn(*args, **kwargs): - pass -warnings.warn = warn -#################################################################################### -import re -import pdb -import pprint -from itertools import cycle, combinations -from collections import defaultdict, OrderedDict -import copy -import time -import sys -import random -import xlrd -import statsmodels -from io import BytesIO -import base64 -from functools import reduce -import copy -####################################################################################################### -def classify_features(dfte, depVar, verbose=0): - dfte = copy.deepcopy(dfte) - if isinstance(depVar, list): - orig_preds = [x for x in list(dfte) if x not in depVar] - else: - orig_preds = [x for x in list(dfte) if x not in [depVar]] - ################# CLASSIFY COLUMNS HERE ###################### - var_df = classify_columns(dfte[orig_preds], verbose) - ##### Classify Columns ################ - IDcols = var_df['id_vars'] - discrete_string_vars = var_df['nlp_vars']+var_df['discrete_string_vars'] - cols_delete = var_df['cols_delete'] - bool_vars = var_df['string_bool_vars'] + var_df['num_bool_vars'] - int_vars = var_df['int_vars'] - categorical_vars = var_df['cat_vars'] + var_df['factor_vars'] + int_vars + bool_vars - date_vars = var_df['date_vars'] - if len(var_df['continuous_vars'])==0 and len(int_vars)>0: - continuous_vars = var_df['int_vars'] - categorical_vars = left_subtract(categorical_vars, int_vars) - int_vars = [] - else: - continuous_vars = var_df['continuous_vars'] - preds = [x for x in orig_preds if x not in IDcols+cols_delete+discrete_string_vars] - if len(IDcols+cols_delete+discrete_string_vars) == 0: - print(' No variables removed since no ID or low-information variables found in data set') - else: - print(' %d variables removed since they were ID or low-information variables' - %len(IDcols+cols_delete+discrete_string_vars)) - if verbose >= 1: - print(' List of variables removed: %s' %(IDcols+cols_delete+discrete_string_vars)) - ############# Check if there are too many columns to visualize ################ - ppt = pprint.PrettyPrinter(indent=4) - if verbose==1 and len(cols_list) <= max_cols_analyzed: - marthas_columns(dft,verbose) - print(" Columns to delete:") - ppt.pprint(' %s' % cols_delete) - print(" Boolean variables %s ") - ppt.pprint(' %s' % bool_vars) - print(" Categorical variables %s ") - ppt.pprint(' %s' % categorical_vars) - print(" Continuous variables %s " ) - ppt.pprint(' %s' % continuous_vars) - print(" Discrete string variables %s " ) - ppt.pprint(' %s' % discrete_string_vars) - print(" Date and time variables %s " ) - ppt.pprint(' %s' % date_vars) - print(" ID 
variables %s ") - ppt.pprint(' %s' % IDcols) - print(" Target variable %s ") - ppt.pprint(' %s' % depVar) - elif verbose==1 and len(cols_list) > max_cols_analyzed: - print(' Total columns > %d, too numerous to list.' %max_cols_analyzed) - features_dict = dict([('IDcols',IDcols),('cols_delete',cols_delete),('bool_vars',bool_vars),('categorical_vars',categorical_vars), - ('continuous_vars',continuous_vars),('discrete_string_vars',discrete_string_vars), - ('date_vars',date_vars)]) - return features_dict -####################################################################################################### -def marthas_columns(data,verbose=0): - """ - This program is named in honor of my one of students who came up with the idea for it. - It's a neat way of printing data types and information compared to the boring describe() function in Pandas. - """ - data = data[:] - print('Data Set Shape: %d rows, %d cols' % data.shape) - if data.shape[1] > 30: - print('Too many columns to print') - else: - if verbose==1: - print('Data Set columns info:') - for col in data.columns: - print('* %s: %d nulls, %d unique vals, most common: %s' % ( - col, - data[col].isnull().sum(), - data[col].nunique(), - data[col].value_counts().head(2).to_dict() - )) - print('--------------------------------------------------------------------') -################################################################################ -######### NEW And FAST WAY to CLASSIFY COLUMNS IN A DATA SET ####### -################################################################################ -def classify_columns(df_preds, verbose=0): - """ - Takes a dataframe containing only predictors to be classified into various types. - DO NOT SEND IN A TARGET COLUMN since it will try to include that into various columns. - Returns a data frame containing columns and the class it belongs to such as numeric, - categorical, date or id column, boolean, nlp, discrete_string and cols to delete... 
- ####### Returns a dictionary with 10 kinds of vars like the following: # continuous_vars,int_vars - # cat_vars,factor_vars, bool_vars,discrete_string_vars,nlp_vars,date_vars,id_vars,cols_delete - """ - train = copy.deepcopy(df_preds) - #### If there are 30 chars are more in a discrete_string_var, it is then considered an NLP variable - max_nlp_char_size = 30 - max_cols_to_print = 30 - print('############## C L A S S I F Y I N G V A R I A B L E S ####################') - print('Classifying variables in data set...') - #### Cat_Limit defines the max number of categories a column can have to be called a categorical colum - cat_limit = 35 - float_limit = 15 #### Make this limit low so that float variables below this limit become cat vars ### - def add(a,b): - return a+b - sum_all_cols = dict() - orig_cols_total = train.shape[1] - #Types of columns - cols_delete = [col for col in list(train) if (len(train[col].value_counts()) == 1 - ) | (train[col].isnull().sum()/len(train) >= 0.90)] - train = train[left_subtract(list(train),cols_delete)] - var_df = pd.Series(dict(train.dtypes)).reset_index(drop=False).rename( - columns={0:'type_of_column'}) - sum_all_cols['cols_delete'] = cols_delete - var_df['bool'] = var_df.apply(lambda x: 1 if x['type_of_column'] in ['bool','object'] - and len(train[x['index']].value_counts()) == 2 else 0, axis=1) - string_bool_vars = list(var_df[(var_df['bool'] ==1)]['index']) - sum_all_cols['string_bool_vars'] = string_bool_vars - var_df['num_bool'] = var_df.apply(lambda x: 1 if x['type_of_column'] in [np.uint8, - np.uint16, np.uint32, np.uint64, - 'int8','int16','int32','int64', - 'float16','float32','float64'] and len( - train[x['index']].value_counts()) == 2 else 0, axis=1) - num_bool_vars = list(var_df[(var_df['num_bool'] ==1)]['index']) - sum_all_cols['num_bool_vars'] = num_bool_vars - ###### This is where we take all Object vars and split them into diff kinds ### - discrete_or_nlp = var_df.apply(lambda x: 1 if x['type_of_column'] in ['object'] and x[ - 'index'] not in string_bool_vars+cols_delete else 0,axis=1) - ######### This is where we figure out whether a string var is nlp or discrete_string var ### - var_df['nlp_strings'] = 0 - var_df['discrete_strings'] = 0 - var_df['cat'] = 0 - var_df['id_col'] = 0 - discrete_or_nlp_vars = var_df.loc[discrete_or_nlp==1]['index'].values.tolist() - if len(var_df.loc[discrete_or_nlp==1]) != 0: - for col in discrete_or_nlp_vars: - #### first fill empty or missing vals since it will blowup ### - train[col] = train[col].fillna(' ') - if train[col].map(lambda x: len(x) if type(x)==str else 0).mean( - ) >= max_nlp_char_size and len(train[col].value_counts() - ) <= int(0.9*len(train)) and col not in string_bool_vars: - var_df.loc[var_df['index']==col,'nlp_strings'] = 1 - elif len(train[col].value_counts()) > cat_limit and len(train[col].value_counts() - ) <= int(0.9*len(train)) and col not in string_bool_vars: - var_df.loc[var_df['index']==col,'discrete_strings'] = 1 - elif len(train[col].value_counts()) > cat_limit and len(train[col].value_counts() - ) == len(train) and col not in string_bool_vars: - var_df.loc[var_df['index']==col,'id_col'] = 1 - else: - var_df.loc[var_df['index']==col,'cat'] = 1 - nlp_vars = list(var_df[(var_df['nlp_strings'] ==1)]['index']) - sum_all_cols['nlp_vars'] = nlp_vars - discrete_string_vars = list(var_df[(var_df['discrete_strings'] ==1) ]['index']) - sum_all_cols['discrete_string_vars'] = discrete_string_vars - ###### This happens only if a string column happens to be an ID column ####### - #### DO 
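#### Standalone restatement of the string-column rule applied above. The thresholds
#### (max_nlp_char_size=30, cat_limit=35) are copied from this function; the separate
#### boolean-column exclusion is omitted here for brevity.
def classify_string_column(series, max_nlp_char_size=30, cat_limit=35):
    series = series.fillna(' ')
    n_unique = series.nunique()
    avg_len = series.map(lambda x: len(x) if isinstance(x, str) else 0).mean()
    if avg_len >= max_nlp_char_size and n_unique <= int(0.9 * len(series)):
        return 'nlp'                  # long free-text strings
    elif cat_limit < n_unique <= int(0.9 * len(series)):
        return 'discrete_string'      # too many distinct values to treat as categories
    elif n_unique > cat_limit and n_unique == len(series):
        return 'id'                   # a different value on every row
    return 'categorical'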
NOT Add this to ID_VARS yet. It will be done later.. Dont change it easily... - #### Category DTYPE vars are very special = they can be left as is and not disturbed in Python. ### - var_df['dcat'] = var_df.apply(lambda x: 1 if str(x['type_of_column'])=='category' else 0, - axis=1) - factor_vars = list(var_df[(var_df['dcat'] ==1)]['index']) - sum_all_cols['factor_vars'] = factor_vars - ######################################################################## - date_or_id = var_df.apply(lambda x: 1 if x['type_of_column'] in [np.uint8, - np.uint16, np.uint32, np.uint64, - 'int8','int16', - 'int32','int64'] and x[ - 'index'] not in string_bool_vars+num_bool_vars+discrete_string_vars+nlp_vars else 0, - axis=1) - ######### This is where we figure out whether a numeric col is date or id variable ### - var_df['int'] = 0 - var_df['date_time'] = 0 - ### if a particular column is date-time type, now set it as a date time variable ## - var_df['date_time'] = var_df.apply(lambda x: 1 if x['type_of_column'] in [' 2050: - var_df.loc[var_df['index']==col,'id_col'] = 1 - else: - try: - pd.to_datetime(train[col],infer_datetime_format=True) - var_df.loc[var_df['index']==col,'date_time'] = 1 - except: - var_df.loc[var_df['index']==col,'id_col'] = 1 - else: - if train[col].min() < 1900 or train[col].max() > 2050: - if col not in num_bool_vars: - var_df.loc[var_df['index']==col,'int'] = 1 - else: - try: - pd.to_datetime(train[col],infer_datetime_format=True) - var_df.loc[var_df['index']==col,'date_time'] = 1 - except: - if col not in num_bool_vars: - var_df.loc[var_df['index']==col,'int'] = 1 - else: - pass - int_vars = list(var_df[(var_df['int'] ==1)]['index']) - date_vars = list(var_df[(var_df['date_time'] == 1)]['index']) - id_vars = list(var_df[(var_df['id_col'] == 1)]['index']) - sum_all_cols['int_vars'] = int_vars - copy_date_vars = copy.deepcopy(date_vars) - for date_var in copy_date_vars: - #### This test is to make sure sure date vars are actually date vars - try: - pd.to_datetime(train[date_var],infer_datetime_format=True) - except: - ##### if not a date var, then just add it to delete it from processing - cols_delete.append(date_var) - date_vars.remove(date_var) - sum_all_cols['date_vars'] = date_vars - sum_all_cols['id_vars'] = id_vars - sum_all_cols['cols_delete'] = cols_delete - ## This is an EXTREMELY complicated logic for cat vars. Don't change it unless you test it many times! 
- var_df['numeric'] = 0 - float_or_cat = var_df.apply(lambda x: 1 if x['type_of_column'] in ['float16', - 'float32','float64'] else 0, - axis=1) - if len(var_df.loc[float_or_cat == 1]) > 0: - for col in var_df.loc[float_or_cat == 1]['index'].values.tolist(): - if len(train[col].value_counts()) > 2 and len(train[col].value_counts() - ) <= float_limit and len(train[col].value_counts()) <= len(train): - var_df.loc[var_df['index']==col,'cat'] = 1 - else: - if col not in num_bool_vars: - var_df.loc[var_df['index']==col,'numeric'] = 1 - cat_vars = list(var_df[(var_df['cat'] ==1)]['index']) - continuous_vars = list(var_df[(var_df['numeric'] ==1)]['index']) - ######## V E R Y I M P O R T A N T ################################################### - ##### There are a couple of extra tests you need to do to remove abberations in cat_vars ### - cat_vars_copy = copy.deepcopy(cat_vars) - for cat in cat_vars_copy: - if df_preds[cat].dtype==float: - continuous_vars.append(cat) - cat_vars.remove(cat) - var_df.loc[var_df['index']==cat,'cat'] = 0 - var_df.loc[var_df['index']==cat,'numeric'] = 1 - elif len(df_preds[cat].value_counts()) == df_preds.shape[0]: - id_vars.append(cat) - cat_vars.remove(cat) - var_df.loc[var_df['index']==cat,'cat'] = 0 - var_df.loc[var_df['index']==cat,'id_col'] = 1 - sum_all_cols['cat_vars'] = cat_vars - sum_all_cols['continuous_vars'] = continuous_vars - sum_all_cols['id_vars'] = id_vars - ###### This is where you consoldate the numbers ########### - var_dict_sum = dict(zip(var_df.values[:,0], var_df.values[:,2:].sum(1))) - for col, sumval in var_dict_sum.items(): - if sumval == 0: - print('%s of type=%s is not classified' %(col,train[col].dtype)) - elif sumval > 1: - print('%s of type=%s is classified into more then one type' %(col,train[col].dtype)) - else: - pass - ############### This is where you print all the types of variables ############## - ####### Returns 8 vars in the following order: continuous_vars,int_vars,cat_vars, - ### string_bool_vars,discrete_string_vars,nlp_vars,date_or_id_vars,cols_delete - if verbose == 1: - print(" Number of Numeric Columns = ", len(continuous_vars)) - print(" Number of Integer-Categorical Columns = ", len(int_vars)) - print(" Number of String-Categorical Columns = ", len(cat_vars)) - print(" Number of Factor-Categorical Columns = ", len(factor_vars)) - print(" Number of String-Boolean Columns = ", len(string_bool_vars)) - print(" Number of Numeric-Boolean Columns = ", len(num_bool_vars)) - print(" Number of Discrete String Columns = ", len(discrete_string_vars)) - print(" Number of NLP String Columns = ", len(nlp_vars)) - print(" Number of Date Time Columns = ", len(date_vars)) - print(" Number of ID Columns = ", len(id_vars)) - print(" Number of Columns to Delete = ", len(cols_delete)) - if verbose == 2: - marthas_columns(df_preds,verbose=1) - print(" Numeric Columns: %s" %continuous_vars[:max_cols_to_print]) - print(" Integer-Categorical Columns: %s" %int_vars[:max_cols_to_print]) - print(" String-Categorical Columns: %s" %cat_vars[:max_cols_to_print]) - print(" Factor-Categorical Columns: %s" %factor_vars[:max_cols_to_print]) - print(" String-Boolean Columns: %s" %string_bool_vars[:max_cols_to_print]) - print(" Numeric-Boolean Columns: %s" %num_bool_vars[:max_cols_to_print]) - print(" Discrete String Columns: %s" %discrete_string_vars[:max_cols_to_print]) - print(" NLP text Columns: %s" %nlp_vars[:max_cols_to_print]) - print(" Date Time Columns: %s" %date_vars[:max_cols_to_print]) - print(" ID Columns: %s" %id_vars[:max_cols_to_print]) - 
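#### The float-column rule applied earlier in this function, as a tiny helper
#### (float_limit=15 is the threshold defined at the top of classify_columns):
def float_column_is_categorical(series, float_limit=15):
    # a float column with more than 2 but at most `float_limit` distinct values is
    # re-labelled categorical; anything with more distinct values stays continuous
    return 2 < series.nunique() <= float_limit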
print(" Columns that will not be considered in modeling: %s" %cols_delete[:max_cols_to_print]) - ##### now collect all the column types and column names into a single dictionary to return! - len_sum_all_cols = reduce(add,[len(v) for v in sum_all_cols.values()]) - if len_sum_all_cols == orig_cols_total: - print(' %d Predictors classified...' %orig_cols_total) - print(' This does not include the Target column(s)') - else: - print('No of columns classified %d does not match %d total cols. Continuing...' %( - len_sum_all_cols, orig_cols_total)) - ls = sum_all_cols.values() - flat_list = [item for sublist in ls for item in sublist] - if len(left_subtract(list(train),flat_list)) == 0: - print(' Missing columns = None') - else: - print(' Missing columns = %s' %left_subtract(list(train),flat_list)) - return sum_all_cols -################################################################################# -from collections import Counter -import time -from sklearn.feature_selection import chi2, mutual_info_regression, mutual_info_classif -from sklearn.feature_selection import SelectKBest -################################################################################## -def load_file_dataframe(dataname, sep=",", header=0, verbose=0): - start_time = time.time() - ########################### This is where we load file or data frame ############### - if isinstance(dataname,str): - #### this means they have given file name as a string to load the file ##### - if dataname != '' and dataname.endswith(('csv')): - codex = ['utf-8', 'iso-8859-1', 'cp1252', 'latin1'] - for code in codex: - try: - dfte = pd.read_csv(dataname,sep=sep,index_col=None,encoding=code) - print('Encoder %s chosen to read CSV file' %code) - print('Shape of your Data Set loaded: %s' %(dfte.shape,)) - return dfte - except: - print('Encoding codex %s does not work for this file' %code) - continue - elif dataname.endswith(('xlsx','xls','txt')): - #### It's very important to get header rows in Excel since people put headers anywhere in Excel# - dfte = pd.read_excel(dataname,header=header) - print('Shape of your Data Set loaded: %s' %(dfte.shape,)) - return dfte - else: - print('File not able to be loaded') - return - if isinstance(dataname,pd.DataFrame): - #### this means they have given a dataframe name to use directly in processing ##### - dfte = copy.deepcopy(dataname) - return dfte - else: - print('Dataname input must be a filename with path to that file or a Dataframe') - return -################################################################################## -# Removes duplicates from a list to return unique values - USED ONLYONCE -def find_remove_duplicates(values): - output = [] - seen = set() - for value in values: - if value not in seen: - output.append(value) - seen.add(value) - return output -################################################################################# -#### Regression or Classification type problem -def analyze_problem_type(train, target, verbose=0) : - target = copy.deepcopy(target) - cat_limit = 30 ### this determines the number of categories to name integers as classification ## - float_limit = 15 ### this limits the number of float variable categories for it to become cat var - if isinstance(target, str): - target = [target] - if len(target) == 1: - targ = target[0] - model_label = 'Single_Label' - else: - targ = target[0] - model_label = 'Multi_Label' - #### This is where you detect what kind of problem it is ################# - if train[targ].dtype in ['int64', 'int32','int16']: - if 
len(train[targ].unique()) <= 2: - model_class = 'Binary_Classification' - elif len(train[targ].unique()) > 2 and len(train[targ].unique()) <= cat_limit: - model_class = 'Multi_Classification' - else: - model_class = 'Regression' - elif train[targ].dtype in ['float']: - if len(train[targ].unique()) <= 2: - model_class = 'Binary_Classification' - elif len(train[targ].unique()) > 2 and len(train[targ].unique()) <= float_limit: - model_class = 'Multi_Classification' - else: - model_class = 'Regression' - else: - if len(train[targ].unique()) <= 2: - model_class = 'Binary_Classification' - else: - model_class = 'Multi_Classification' - ########### print this for the start of next step ########### - if verbose <= 1: - print('''################ %s %s Feature Selection Started #####################''' %( - model_label,model_class)) - return model_class -##################################################################################### -from collections import defaultdict -from collections import OrderedDict -import time -def return_dictionary_list(lst_of_tuples): - """ Returns a dictionary of lists if you send in a list of Tuples""" - orDict = defaultdict(list) - # iterating over list of tuples - for key, val in lst_of_tuples: - orDict[key].append(val) - return orDict -################################################################################## -def remove_variables_using_fast_correlation(df, numvars, modeltype, target, - corr_limit = 0.70,verbose=0): - """ - ########################################################################################## - ##### SULOV stands for Searching Uncorrelated List Of Variables ############ - This highly efficient method removes variables that are highly correlated using a series of - pair-wise correlation knockout rounds. It is extremely fast and hence can work on thousands - of variables in less than a minute, even on a laptop. You need to send in a list of numeric - variables and that's all! The method defines high Correlation as anything over 0.70 (absolute) - but this can be changed. If two variables have absolute correlation higher than this, they - will be marked, and using a process of elimination, one of them will get knocked out: - To decide order of variables to keep, we use mutuail information score to select. MIS returns - a ranked list of these correlated variables: when we select one, we knock out others - that it is correlated to. Then we select next var. This way we knock out correlated variables. - Finally we are left with uncorrelated variables that are also highly important in mutual score. - ############## YOU MUST INCLUDE THE ABOVE MESSAGE IF YOU COPY THIS CODE IN YOUR LIBRARY ##### - """ - import copy - target = copy.deepcopy(target) - print('Searching for highly correlated variables from %d variables using SULOV method' %len(numvars)) - print('##### SULOV : Searching for Uncorrelated List Of Variables (takes time...) 
############') - correlation_dataframe = df[numvars].corr().abs().astype(np.float16) - ######### This is how you create a dictionary of which var is highly correlated to a list of vars #### - corr_values = correlation_dataframe.values - col_index = correlation_dataframe.columns.tolist() - index_triupper = list(zip(np.triu_indices_from(corr_values,k=1)[0],np.triu_indices_from( - corr_values,k=1)[1])) - high_corr_index_list = [x for x in np.argwhere(abs(corr_values[np.triu_indices(len(corr_values), k = 1)])>=corr_limit)] - low_corr_index_list = [x for x in np.argwhere(abs(corr_values[np.triu_indices(len(corr_values), k = 1)]) 1: - corr_pair_dict[key] += val - else: - corr_pair_dict[key] = val - #### corr_pair_dict is used later to make the network diagram to see which vars are correlated to which - # Selecting upper triangle of correlation matrix ## this is a fast way to find highly correlated vars - upper_tri = correlation_dataframe.where(np.triu(np.ones(correlation_dataframe.shape), - k=1).astype(np.bool)) - empty_df = upper_tri[abs(upper_tri)>corr_limit] - ### if none of the variables are highly correlated, you can skip this whole drawing - if empty_df.isnull().all().all(): - print(' No highly correlated variables in data set to remove. All selected...') - return numvars - #### It's important to find the highly correlated features first ############# - lower_tri = correlation_dataframe.where(np.tril(np.ones(correlation_dataframe.shape), - k=-1).astype(np.bool)) - lower_df = lower_tri[abs(lower_tri)>corr_limit] - corr_list = empty_df.columns[[not(empty_df[x].isnull().all()) for x in list(empty_df)]].tolist( - )+lower_df.columns[[not(lower_df[x].isnull().all()) for x in list(lower_df)]].tolist() - corr_list = find_remove_duplicates(corr_list) - ###### This is for ordering the variables in the highest to lowest importance to target ### - if len(corr_list) == 0: - final_list = list(correlation_dataframe) - print('Selecting all (%d) variables since none of them are highly correlated...' 
%len(numvars)) - return numvars - else: - if isinstance(target, list): - target = target[0] - max_feats = len(corr_list) - if modeltype == 'Regression': - sel_function = mutual_info_regression - fs = SelectKBest(score_func=sel_function, k=max_feats) - else: - sel_function = mutual_info_classif - fs = SelectKBest(score_func=sel_function, k=max_feats) - try: - fs.fit(df[corr_list].astype(np.float16), df[target]) - mutual_info = dict(zip(corr_list,fs.scores_)) - #### The first variable in list has the highest correlation to the target variable ### - sorted_by_mutual_info =[key for (key,val) in sorted(mutual_info.items(), key=lambda kv: kv[1],reverse=True)] - ##### Now we select the final list of correlated variables ########### - selected_corr_list = [] - #### You have to make multiple copies of this sorted list since it is iterated many times #### - orig_sorted = copy.deepcopy(sorted_by_mutual_info) - copy_sorted = copy.deepcopy(sorted_by_mutual_info) - copy_pair = copy.deepcopy(corr_pair_dict) - #### select each variable by the highest mutual info and see what vars are correlated to it - for each_corr_name in copy_sorted: - ### add the selected var to the selected_corr_list - selected_corr_list.append(each_corr_name) - for each_remove in copy_pair[each_corr_name]: - #### Now remove each variable that is highly correlated to the selected variable - if each_remove in copy_sorted: - copy_sorted.remove(each_remove) - ##### Now we combine the uncorrelated list to the selected correlated list above - rem_col_list = left_subtract(list(correlation_dataframe),corr_list) - final_list = rem_col_list + selected_corr_list - removed_cols = left_subtract(numvars, final_list) - except: - print(' SULOV Method crashing due to memory error, trying alternative simpler method...') - #### Dropping highly correlated Features fast using simple linear correlation ### - removed_cols = remove_highly_correlated_vars_fast(train[numvars],corr_limit) - final_list = left_subtract(numvars, removed_cols) - if len(removed_cols) > 0: - print(' Removing (%d) highly correlated variables:' %(len(removed_cols))) - if len(removed_cols) <= 30: - print(' %s' %removed_cols) - if len(final_list) <= 30: - print(' Following (%d) vars selected: %s' %(len(final_list),final_list)) - ############## D R A W C O R R E L A T I O N N E T W O R K ################## - selected = copy.deepcopy(final_list) - try: - import networkx as nx - except: - print(' Python networkx library not installed. 
Install it for feature selection visualization.') - return - #### Now start building the graph ################### - gf = nx.Graph() - ### the mutual info score gives the size of the bubble ### - multiplier = 2100 - for each in orig_sorted: - gf.add_node(each, size=int(max(1,mutual_info[each]*multiplier))) - ######### This is where you calculate the size of each node to draw - sizes = [mutual_info[x]*multiplier for x in list(gf.nodes())] - #### The sizes of the bubbles for each node is determined by its mutual information score value - corr = df[corr_list].corr() - high_corr = corr[abs(corr)>corr_limit] - ## high_corr is the dataframe of a few variables that are highly correlated to each other - combos = combinations(corr_list,2) - ### this gives the strength of correlation between 2 nodes ## - multiplier = 20 - for (var1, var2) in combos: - if np.isnan(high_corr.loc[var1,var2]): - pass - else: - gf.add_edge(var1, var2,weight=multiplier*high_corr.loc[var1,var2]) - ######## Now start building the networkx graph ########################## - import copy - widths = nx.get_edge_attributes(gf, 'weight') - nodelist = gf.nodes() - cols = 5 - height_size = 5 - width_size = 15 - rows = int(len(corr_list)/cols) - if rows < 1: - rows = 1 - plt.figure(figsize=(width_size,min(20,height_size*rows))) - pos = nx.shell_layout(gf) - nx.draw_networkx_nodes(gf,pos, - nodelist=nodelist, - node_size=sizes, - node_color='blue', - alpha=0.5) - nx.draw_networkx_edges(gf,pos, - edgelist = widths.keys(), - width=list(widths.values()), - edge_color='lightblue', - alpha=0.6) - pos_higher = {} - x_off = 0.04 # offset on the x axis - y_off = 0.04 # offset on the y axis - for k, v in pos.items(): - pos_higher[k] = (v[0]+x_off, v[1]+y_off) - if len(selected) == 0: - nx.draw_networkx_labels(gf, pos=pos_higher, - labels=dict(zip(nodelist,nodelist)), - font_color='black') - else: - nx.draw_networkx_labels(gf, pos=pos_higher, - labels = dict(zip(nodelist,[x+' (selected)' if x in selected else x+' (removed)' for x in nodelist])), - font_color='black') - plt.box(True) - plt.title("""In SULOV, we repeatedly remove features with lower mutual info scores among highly correlated pairs (see figure), - SULOV selects the feature with higher mutual info score related to target when choosing between a pair. """, fontsize=10) - plt.suptitle('How SULOV Method of Removing Highly Correlated Features in a Data Set works', fontsize=20,y=1.03) - red_patch = mpatches.Patch(color='blue', label='Bigger size of circle denotes higher mutual info score with target') - blue_patch = mpatches.Patch(color='lightblue', label='Thicker line width denotes higher correlation between two variables') - plt.legend(handles=[red_patch, blue_patch],loc='best') - plt.show(); - ##### N E T W O R K D I A G R A M C O M P L E T E ################# - return final_list -############################################################################################### -def count_freq_in_list(lst): - """ - This counts the frequency of items in a list but MAINTAINS the order of appearance of items. - This order is very important when you are doing certain functions. Hence this function! 
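#### Standalone restatement of the SULOV knockout implemented in
#### remove_variables_using_fast_correlation() above. It assumes a purely numeric
#### DataFrame `df` and a regression target; the library switches to
#### mutual_info_classif for classification targets.
from sklearn.feature_selection import mutual_info_regression

def sulov_sketch(df, numvars, target, corr_limit=0.70):
    corr = df[numvars].corr().abs()
    # every pair of features whose absolute correlation crosses the limit
    pairs = [(a, b) for i, a in enumerate(numvars) for b in numvars[i + 1:]
             if corr.loc[a, b] >= corr_limit]
    correlated = sorted({v for pair in pairs for v in pair}, key=numvars.index)
    if not correlated:
        return numvars                          # nothing to knock out
    # rank the correlated features by mutual information with the target
    mis = dict(zip(correlated, mutual_info_regression(df[correlated], df[target])))
    survivors, knocked_out = [], set()
    for feat in sorted(correlated, key=mis.get, reverse=True):
        if feat in knocked_out:
            continue
        survivors.append(feat)                  # keep the higher-scoring feature...
        for a, b in pairs:                      # ...and knock out everything tied to it
            if a == feat:
                knocked_out.add(b)
            elif b == feat:
                knocked_out.add(a)
    uncorrelated = [v for v in numvars if v not in correlated]
    return uncorrelated + survivors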
- """ - temp=np.unique(lst) - result = [] - for i in temp: - result.append((i,lst.count(i))) - return result -############################################################################################### -def left_subtract(l1,l2): - lst = [] - for i in l1: - if i not in l2: - lst.append(i) - return lst -################################################################################# -def convert_train_test_cat_col_to_numeric(start_train, start_test, col): - """ - #### This is the easiest way to label encode object variables in both train and test - #### This takes care of some categories that are present in train and not in test - ### and vice versa - """ - start_train = copy.deepcopy(start_train) - start_test = copy.deepcopy(start_test) - if start_train[col].isnull().sum() > 0: - start_train[col] = start_train[col].fillna("NA") - train_categs = list(pd.unique(start_train[col].values)) - if not isinstance(start_test,str) : - test_categs = list(pd.unique(start_test[col].values)) - categs_all = train_categs+test_categs - dict_all = return_factorized_dict(categs_all) - else: - dict_all = return_factorized_dict(train_categs) - start_train[col] = start_train[col].map(dict_all) - if not isinstance(start_test,str) : - if start_test[col].isnull().sum() > 0: - start_test[col] = start_test[col].fillna("NA") - start_test[col] = start_test[col].map(dict_all) - return start_train, start_test -############################################################################### -def return_factorized_dict(ls): - """ - ###### Factorize any list of values in a data frame using this neat function - if your data has any NaN's it automatically marks it as -1 and returns that for NaN's - Returns a dictionary mapping previous values with new values. - """ - factos = pd.unique(pd.factorize(ls)[0]) - categs = pd.unique(pd.factorize(ls)[1]) - if -1 in factos: - categs = np.insert(categs,np.where(factos==-1)[0][0],np.nan) - return dict(zip(categs,factos)) -########################################################################################### -############## CONVERSION OF STRING COLUMNS TO NUMERIC WITHOUT LABEL ENCODER ######### -####################################################################################### -import copy -import pdb -def convert_a_column_to_numeric(x, col_dict=""): - '''Function converts any pandas series (or column) consisting of string chars, - into numeric values. It converts an all-string column to an all-number column. - This is an amazing function which performs exactly like a Label Encoding - except that it is simpler and faster''' - if isinstance(col_dict, str): - values = np.unique(x) - values2nums = dict(zip(values,range(len(values)))) - convert_dict = dict(zip(range(len(values)),values)) - return x.replace(values2nums), convert_dict - else: - convert_dict = copy.deepcopy(col_dict) - keys = col_dict.keys() - newkeys = np.unique(x) - rem_keys = left_subtract(newkeys, keys) - max_val = max(col_dict.values()) + 1 - for eachkey in rem_keys: - convert_dict.update({eachkey:max_val}) - max_val += 1 - return x.replace(convert_dict) -####################################################################################### -def convert_a_mixed_object_column_to_numeric(x, col_dict=''): - """ - This is the main utility that converts any string column to numeric. - It does not need Label Encoder since it picks up an string that may not be in test data. 
- """ - x = x.astype(str) - if isinstance(col_dict, str): - x, convert_dict = convert_a_column_to_numeric(x) - convert_dict = dict([(v,k) for (k,v) in convert_dict.items()]) - return x, convert_dict - else: - x = convert_a_column_to_numeric(x, col_dict) - return x, '' -###################################################################################### -def convert_all_object_columns_to_numeric(train, test=""): - """ - ####################################################################################### - This is a utility that converts string columns to numeric WITHOUT LABEL ENCODER. - The beauty of this utility is that it does not blow up when it finds strings in test not in train. - ####################################################################################### - """ - train = copy.deepcopy(train) - lis = [] - lis = train.select_dtypes('object').columns.tolist() + train.select_dtypes('category').columns.tolist() - if not (len(lis)==0): - for everycol in lis: - #print(' Converting %s to numeric' %everycol) - try: - train[everycol], train_dict = convert_a_mixed_object_column_to_numeric(train[everycol]) - if not isinstance(test, str): - test[everycol],_ = convert_a_mixed_object_column_to_numeric(test[everycol], train_dict) - except: - print('Error converting %s column from string to numeric. Continuing...' %everycol) - continue - return train, test -################################################################################### -from sklearn.feature_selection import chi2, mutual_info_regression, mutual_info_classif -from sklearn.feature_selection import SelectKBest -def featurewiz(dataname, target, corr_limit=0.7, verbose=0, sep=",", header=0): - """ - This is a fast utility that uses XGB to find top features. You - It returns a list of important features. - Since it is XGB, you dont have to restrict the input to just numeric vars. - You can send in all kinds of vars and it will take care of transforming it. Sweet! - """ - train = load_file_dataframe(dataname, sep, header, verbose) - start_time = time.time() - #### If there are more than 30 categorical variables in a data set, it is worth reducing features. - #### Otherwise. XGBoost is pretty good at finding the best features whether cat or numeric ! 
- n_splits = 5 - max_depth = 8 - max_cats = 5 - ###################### I M P O R T A N T #################################### - subsample = 0.7 - col_sub_sample = 0.7 - test_size = 0.2 - seed = 1 - early_stopping = 5 - ####### All the default parameters are set up now ######### - kf = KFold(n_splits=n_splits, random_state=33) - ###### This is where we set the CPU and GPU parameters for XGBoost - GPU_exists = check_if_GPU_exists() - ##### Set the Scoring Parameters here based on each model and preferences of user ############## - cpu_params = {} - param = {} - cpu_params['nthread'] = -1 - cpu_params['tree_method'] = 'hist' - cpu_params['grow_policy'] = 'depthwise' - cpu_params['max_depth'] = max_depth - cpu_params['max_leaves'] = 0 - cpu_params['verbosity'] = 0 - cpu_params['gpu_id'] = 0 - cpu_params['updater'] = 'grow_colmaker' - cpu_params['predictor'] = 'cpu_predictor' - cpu_params['num_parallel_tree'] = 1 - if GPU_exists: - param['nthread'] = -1 - param['tree_method'] = 'gpu_hist' - param['grow_policy'] = 'depthwise' - param['max_depth'] = max_depth - param['max_leaves'] = 0 - param['verbosity'] = 0 - param['gpu_id'] = 0 - param['updater'] = 'grow_gpu_hist' #'prune' - param['predictor'] = 'gpu_predictor' - param['num_parallel_tree'] = 1 - print(' Running XGBoost using GPU parameters') - else: - param = copy.deepcopy(cpu_params) - print(' Running XGBoost using CPU parameters') - ############################################################################### - if isinstance(target, str): - target = [target] - multi_label = False - else: - if len(target) <= 1: - multi_label = False - else: - multi_label = True - ###### Now we detect the various types of variables to see how to convert them to numeric - features_dict = classify_features(train, target) - cols_to_remove = features_dict['cols_delete'] + features_dict['IDcols'] + features_dict['discrete_string_vars']+features_dict['date_vars'] - preds = [x for x in list(train) if x not in target+cols_to_remove] - numvars = train[preds].select_dtypes(include = 'number').columns.tolist() - catvars = left_subtract(preds, numvars) - rem_vars = copy.deepcopy(catvars) - ########## Now we need to select the right model to run repeatedly #### - if target is None or len(target) == 0: - cols_list = list(train) - modeltype = 'Clustering' - else: - modeltype = analyze_problem_type(train, target) - cols_list = left_subtract(list(train),target) - ###################### I M P O R T A N T ############################################## - ###### This top_num decides how many top_n features XGB selects in each iteration. - #### There a total of 5 iterations. Hence 5x10 means maximum 50 features will be selected. - ##### If there are more than 50 variables, then maximum 25% of its variables will be selected - if len(preds) <= 50: - top_num = 10 - else: - ### the maximum number of variables will 25% of preds which means we divide by 5 and get 5% here - ### The five iterations result in 10% being chosen in each iteration. Hence max 50% of variables! 
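#### The per-round cap on selected features (top_num), condensed into one helper;
#### the thresholds are the ones used in this function:
def features_kept_per_round(n_predictors):
    # up to 50 predictors: keep the 10 best per XGBoost round; above 50: keep the best 10%
    return 10 if n_predictors <= 50 else int(n_predictors * 0.10)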
- top_num = int(len(preds)*0.10) - ###################### I M P O R T A N T ############################################## - important_cats = copy.deepcopy(catvars) - ######## Drop Missing value rows since XGB for some reason ######### - ######## can't handle missing values in early stopping rounds ####### - train.dropna(axis=0,subset=preds+target,inplace=True) - if len(numvars) > 1: - final_list = remove_variables_using_fast_correlation(train,numvars,modeltype,target, - corr_limit,verbose) - else: - final_list = copy.deepcopy(numvars) - ####### This is where you draw how featurewiz works when the verbose = 2 ########### - print(' Adding %s categorical variables to reduced numeric variables of %d' %( - len(important_cats),len(final_list))) - if isinstance(final_list,np.ndarray): - final_list = final_list.tolist() - preds = final_list+important_cats - #######You must convert category variables into integers ############### - if len(important_cats) > 0: - train, _ = convert_all_object_columns_to_numeric(train, "") - ######## Dont move this train and y definition anywhere else ######## - y = train[target] - print('############## F E A T U R E S E L E C T I O N ####################') - important_features = [] - ########## This is for Single_Label problems ###################### - if modeltype == 'Regression': - objective = 'reg:squarederror' - model_xgb = XGBRegressor( n_estimators=100,subsample=subsample,objective=objective, - colsample_bytree=col_sub_sample,reg_alpha=0.5, reg_lambda=0.5, - seed=1,n_jobs=-1,random_state=1) - eval_metric = 'rmse' - else: - #### This is for Classifiers only - classes = np.unique(train[target].values) - if len(classes) == 2: - model_xgb = XGBClassifier(base_score=0.5, booster='gbtree', subsample=subsample, - colsample_bytree=col_sub_sample,gamma=1, learning_rate=0.1, max_delta_step=0, - max_depth=max_depth, min_child_weight=1, missing=-999, n_estimators=100, - n_jobs=-1, nthread=None, objective='binary:logistic', - random_state=1, reg_alpha=0.5, reg_lambda=0.5, - seed=1) - eval_metric = 'logloss' - else: - model_xgb = XGBClassifier(base_score=0.5, booster='gbtree', subsample=subsample, - colsample_bytree=col_sub_sample, gamma=1, learning_rate=0.1, max_delta_step=0, - max_depth=max_depth, min_child_weight=1, missing=-999, n_estimators=100, - n_jobs=-1, nthread=None, objective='multi:softmax', - random_state=1, reg_alpha=0.5, reg_lambda=0.5, - seed=1) - eval_metric = 'mlogloss' - #### Now set the parameters for XGBoost ################### - model_xgb.set_params(**param) - #print('Model parameters: %s' %model_xgb) - if multi_label: - ########## This is for multi_label problems ############################### - if modeltype == 'Regression': - model_xgb = MultiOutputRegressor(model_xgb) - #model_xgb = RegressorChain(model_xgb) - else: - ## just do randomized search CV - no need to do one vs rest unless multi-class - model_xgb = MultiOutputClassifier(model_xgb) - #model_xgb = ClassifierChain(model_xgb) - #### This is where you start to Iterate on Finding Important Features ################ - save_xgb = copy.deepcopy(model_xgb) - train_p = train[preds] - if train_p.shape[1] < 10: - iter_limit = 2 - else: - iter_limit = int(train_p.shape[1]/5+0.5) - print('Current number of predictors = %d ' %(train_p.shape[1],)) - print(' Finding Important Features using Boosted Trees algorithm...') - ######## This is where we start training the XGBoost model to find top features #### - try: - for i in range(0,train_p.shape[1],iter_limit): - new_xgb = copy.deepcopy(save_xgb) - print(' 
using %d variables...' %(train_p.shape[1]-i)) - imp_feats = [] - if train_p.shape[1]-i < iter_limit: - X = train_p.iloc[:,i:] - cols_sel = X.columns.tolist() - if modeltype == 'Regression': - train_part = int((1-test_size)*X.shape[0]) - X_train, X_cv, y_train, y_cv = X[:train_part],X[train_part:],y[:train_part],y[train_part:] - else: - X_train, X_cv, y_train, y_cv = train_test_split(X, y, - test_size=test_size, random_state=seed) - try: - if multi_label: - eval_set = [(X_train.values,y_train.values),(X_cv.values,y_cv.values)] - else: - eval_set = [(X_train,y_train),(X_cv,y_cv)] - if multi_label: - model_xgb.fit(X_train,y_train) - else: - model_xgb.fit(X_train,y_train,early_stopping_rounds=early_stopping,eval_set=eval_set, - eval_metric=eval_metric,verbose=False) - except: - #### On Colab, even though GPU exists, many people don't turn it on. - #### In that case, XGBoost blows up when gpu_predictor is used. - #### This is to turn it back to cpu_predictor in case GPU errors! - if GPU_exists: - print('Error: GPU exists but it is not turned on. Using CPU for predictions...') - if multi_label: - new_xgb.estimator.set_params(**cpu_params) - new_xgb.fit(X_train,y_train) - else: - new_xgb.set_params(**cpu_params) - new_xgb.fit(X_train,y_train,early_stopping_rounds=early_stopping,eval_set=eval_set, - eval_metric=eval_metric,verbose=False) - #### This is where you collect the feature importances from each run ############ - if multi_label: - ### doing this for multi-label is a little different for single label ######### - imp_feats = [model_xgb.estimators_[i].feature_importances_ for i in range(len(target))] - imp_feats_df = pd.DataFrame(imp_feats).T - imp_feats_df.columns = target - imp_feats_df.index = cols_sel - imp_feats_df['sum'] = imp_feats_df.sum(axis=1).values - important_features += imp_feats_df.sort_values(by='sum',ascending=False)[:top_num].index.tolist() - else: - ### doing this for single-label is a little different from multi_label ######### - important_features += pd.Series(model_xgb.get_booster().get_score( - importance_type='gain')).sort_values(ascending=False)[:top_num].index.tolist() - ####### order this in the same order in which they were collected ###### - important_features = list(OrderedDict.fromkeys(important_features)) - else: - X = train_p[list(train_p.columns.values)[i:train_p.shape[1]]] - cols_sel = X.columns.tolist() - #### Split here into train and test ##### - if modeltype == 'Regression': - train_part = int((1-test_size)*X.shape[0]) - X_train, X_cv, y_train, y_cv = X[:train_part],X[train_part:],y[:train_part],y[train_part:] - else: - X_train, X_cv, y_train, y_cv = train_test_split(X, y, - test_size=test_size, random_state=seed) - ### set the validation data as arrays in multi-label case ##### - if multi_label: - eval_set = [(X_train.values,y_train.values),(X_cv.values,y_cv.values)] - else: - eval_set = [(X_train,y_train),(X_cv,y_cv)] - ########## Try training the model now ##################### - try: - if multi_label: - model_xgb.fit(X_train,y_train) - else: - model_xgb.fit(X_train,y_train,early_stopping_rounds=early_stopping, - eval_set=eval_set,eval_metric=eval_metric,verbose=False) - except: - #### On Colab, even though GPU exists, many people don't turn it on. - #### In that case, XGBoost blows up when gpu_predictor is used. - #### This is to turn it back to cpu_predictor in case GPU errors! - if GPU_exists: - print('Error: GPU exists but it is not turned on. 
Using CPU for predictions...') - if multi_label: - new_xgb.estimator.set_params(**cpu_params) - new_xgb.fit(X_train,y_train) - else: - new_xgb.set_params(**cpu_params) - new_xgb.fit(X_train,y_train,early_stopping_rounds=early_stopping, - eval_set=eval_set,eval_metric=eval_metric,verbose=False) - ### doing this for multi-label is a little different for single label ######### - if multi_label: - imp_feats = [model_xgb.estimators_[i].feature_importances_ for i in range(len(target))] - imp_feats_df = pd.DataFrame(imp_feats).T - imp_feats_df.columns = target - imp_feats_df.index = cols_sel - imp_feats_df['sum'] = imp_feats_df.sum(axis=1).values - important_features += imp_feats_df.sort_values(by='sum',ascending=False)[:top_num].index.tolist() - else: - important_features += pd.Series(model_xgb.get_booster().get_score( - importance_type='gain')).sort_values(ascending=False)[:top_num].index.tolist() - important_features = list(OrderedDict.fromkeys(important_features)) - except: - print('Finding top features using XGB is crashing. Continuing with all predictors...') - important_features = copy.deepcopy(preds) - return important_features - important_features = list(OrderedDict.fromkeys(important_features)) - print('Found %d important features' %len(important_features)) - print(' Time taken (in seconds) = %0.0f' %(time.time()-start_time)) - numvars = [x for x in numvars if x in important_features] - important_cats = [x for x in important_cats if x in important_features] - return important_features -################################################################################ -def remove_highly_correlated_vars_fast(df, corr_limit=0.70): - """ - This is a simple method to remove highly correlated features fast using Pearson's Correlation. - Use this only for float and integer variables. It will automatically select those only. 
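#### The per-round ranking featurewiz() uses above, shown in isolation on synthetic
#### data: fit an XGBoost model on a block of columns and keep the names of the
#### highest-gain features. The random data and column names are purely illustrative.
import numpy as np
import pandas as pd
from xgboost.sklearn import XGBRegressor

rng = np.random.RandomState(0)
X = pd.DataFrame(rng.rand(200, 6), columns=['f%d' % i for i in range(6)])
y = 3 * X['f0'] - 2 * X['f3'] + 0.1 * rng.rand(200)

model = XGBRegressor(n_estimators=100, max_depth=4, random_state=1)
model.fit(X, y)
gain = pd.Series(model.get_booster().get_score(importance_type='gain'))
top_num = 2
top_feats = gain.sort_values(ascending=False)[:top_num].index.tolist()  # likely ['f0', 'f3']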
- It can be used for very large data sets where featurewiz has trouble with memory - """ - # Creating correlation matrix - correlation_dataframe = df.corr().abs().astype(np.float16) - # Selecting upper triangle of correlation matrix - upper_tri = correlation_dataframe.where(np.triu(np.ones(correlation_dataframe.shape), - k=1).astype(np.bool)) - # Finding index of feature columns with correlation greater than 0.95 - to_drop = [column for column in upper_tri.columns if any(upper_tri[column] > corr_limit)] - print(); - print('Highly correlated columns to remove: %s' %to_drop) - return to_drop -##################################################################################### -import os -def check_if_GPU_exists(): - GPU_exists = False - try: - from tensorflow.python.client import device_lib - dev_list = device_lib.list_local_devices() - print('Number of GPUs = %d' %len(dev_list)) - for i in range(len(dev_list)): - if 'GPU' == dev_list[i].device_type: - GPU_exists = True - print('%s available' %dev_list[i].device_type) - except: - print('') - if not GPU_exists: - try: - os.environ['NVIDIA_VISIBLE_DEVICES'] - print(' GPU active on this device') - return True - except: - print(' No GPU active on this device') - return False - else: - return True -############################################################################################# -from itertools import combinations -import matplotlib.patches as mpatches -import matplotlib.pyplot as plt -from sklearn.feature_selection import chi2, mutual_info_regression, mutual_info_classif -from sklearn.feature_selection import SelectKBest -###################################################################################### -# Removes duplicates from a list to return unique values - USED ONLYONCE -def find_remove_duplicates(values): - output = [] - seen = set() - for value in values: - if value not in seen: - output.append(value) - seen.add(value) - return output -################################################################################ -def add_date_time_features(smalldf, startTime, endTime, splitter_date_string="/",splitter_hour_string=":"): - """ - If you have start date time stamp and end date time stamp, this module will create additional features for such fields. - You must provide a start date time stamp field and if you have an end date time stamp field, you must use it. - Otherwise, you are better off using the create_date_time_features module which is also in this library. - You must provide the following: - smalldf: Dataframe containing your date time fields - startTime: this is hopefully a string field which converts to a date time stamp easily. Make sure it is a string. - endTime: this also must be a string field which converts to a date time stamp easily. Make sure it is a string. - splitter_date_string: usually there is a string such as '/' or '.' between day/month/year etc. Default is assumed / here. - splitter_hour_string: usually there is a string such as ':' or '.' between hour:min:sec etc. Default is assumed : here. - """ - smalldf = smalldf.copy() - add_cols = [] - start_date = 'processing'+startTime+'_start_date' - smalldf[start_date] = smalldf[startTime].map(lambda x: x.split(" ")[0]) - add_cols.append(start_date) - try: - start_time = 'processing'+startTime+'_start_time' - smalldf[start_time] = smalldf[startTime].map(lambda x: x.split(" ")[1]) - add_cols.append(start_time) - except: - ### there is no hour-minutes part of this date time stamp field. 
You can just skip it if it is not there - pass - end_date = 'processing'+endTime+'_end_date' - smalldf[end_date] = smalldf[endTime].map(lambda x: x.split(" ")[0]) - add_cols.append(end_date) - try: - end_time = 'processing'+endTime+'_end_time' - smalldf[end_time] = smalldf[endTime].map(lambda x: x.split(" ")[1]) - add_cols.append(end_time) - except: - ### there is no hour-minutes part of this date time stamp field. You can just skip it if it is not there - pass - view_days = 'processing'+startTime+'_elapsed_days' - smalldf[view_days] = (pd.to_datetime(smalldf[end_date]) - pd.to_datetime(smalldf[start_date])).values.astype(int) - add_cols.append(view_days) - try: - view_time = 'processing'+startTime+'_elapsed_time' - smalldf[view_time] = (pd.to_datetime(smalldf[end_time]) - pd.to_datetime(smalldf[start_time])).astype('timedelta64[s]').values - add_cols.append(view_time) - except: - ### In some date time fields this gives an error so skip it in that case - pass - #### The reason we chose endTime here is that startTime is usually taken care of by another library. So better to do this alone. - year = 'processing'+endTime+'_end_year' - smalldf[year] = smalldf[end_date].map(lambda x: str(x).split(splitter_date_string)[0]).values - add_cols.append(year) - #### The reason we chose endTime here is that startTime is usually taken care of by another library. So better to do this alone. - month = 'processing'+endTime+'_end_month' - smalldf[month] = smalldf[end_date].map(lambda x: str(x).split(splitter_date_string)[1]).values - add_cols.append(month) - try: - #### The reason we chose endTime here is that startTime is usually taken care of by another library. So better to do this alone. - daynum = 'processing'+endTime+'_end_day_number' - smalldf[daynum] = smalldf[end_date].map(lambda x: str(x).split(splitter_date_string)[2]).values - add_cols.append(daynum) - except: - ### In some date time fields the day number is not there. If not, just skip it #### - pass - #### In some date time fields, the hour and minute is not there, so skip it in that case if it errors! - try: - start_hour = 'processing'+startTime+'_start_hour' - smalldf[start_hour] = smalldf[start_time].map(lambda x: str(x).split(splitter_hour_string)[0]).values - add_cols.append(start_hour) - start_min = 'processing'+startTime+'_start_hour' - smalldf[start_min] = smalldf[start_time].map(lambda x: str(x).split(splitter_hour_string)[1]).values - add_cols.append(start_min) - except: - ### If it errors, skip it - pass - #### Check if there is a weekday and weekends in date time columns using endTime only - weekday_num = 'processing'+endTime+'_end_weekday_number' - smalldf[weekday_num] = pd.to_datetime(smalldf[end_date]).dt.weekday.values - add_cols.append(weekday_num) - weekend = 'processing'+endTime+'_end_weekend_flag' - smalldf[weekend] = smalldf[weekday_num].map(lambda x: 1 if x in[5,6] else 0) - add_cols.append(weekend) - #### If everything works well, there should be 13 new columns added by module. All the best! - print('%d columns added using start date=%s and end date=%s processing...' %(len(add_cols),startTime,endTime)) - return smalldf -########################################################################### -def split_one_field_into_many(df, field, splitter, filler, new_names_list, add_count_field=False): - """ - This little function takes any data frame field (string variables only) and splits - it into as many fields as you want in the new_names_list. - You can also specify what string to split on using the splitter argument. 
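#### Hypothetical call of add_date_time_features() defined above; the frame, column
#### names and timestamps are invented purely for illustration.
import pandas as pd
from featurewiz.featurewiz import add_date_time_features

trips = pd.DataFrame({'trip_start': ['2020/01/05 10:15:30', '2020/01/06 22:05:10'],
                      'trip_end':   ['2020/01/07 11:20:00', '2020/01/06 23:45:55']})
trips = add_date_time_features(trips, 'trip_start', 'trip_end',
                               splitter_date_string='/', splitter_hour_string=':')
# adds elapsed days/time plus end year, month, day, start hour/minute and weekday/weekend flags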
- You can also fill Null values that occur due to your splitting by specifying a filler. - if no new_names_list is given, then we use the name of the field itself to split. - add_count_field: False (default). If True, it will count the number of items in - the "field" column before the split. This may be needed in nested dictionary fields. - """ - import warnings - warnings.filterwarnings("ignore") - df = df.copy() - ### First print the maximum number of things in that field - max_things = df[field].map(lambda x: len(x.split(splitter))).max() - if len(new_names_list) == 0: - print(' Max. columns created by splitting %s field is %d.' %( - field,max_things)) - else: - if not max_things == len(new_names_list): - print(' Max. columns created by splitting %s field is %d but you have given %d variable names only. Selecting first %d' %( - field,max_things,len(new_names_list),len(new_names_list))) - ### This creates a new field that counts the number of things that are in that field. - if add_count_field: - num_products_viewed = 'count_things_in_'+field - df[num_products_viewed] = df[field].map(lambda x: len(x.split(";"))).values - ### Clean up the field such that it has the right number of split chars otherwise add to it - df[field] = df[field].map(lambda x: x+splitter*(max_things-len(x.split(";"))) if len(x.split(";")) < max_things else x) - ###### Now you create new fields by split the one large field ######## - if new_names_list == '': - new_names_list = [field+'_'+str(i) for i in range(1,max_things+1)] - try: - for i in range(len(new_names_list)): - df[field].fillna(filler, inplace=True) - df.loc[df[field] == splitter, field] = filler - df[new_names_list[i]] = df[field].map(lambda x: x.split(splitter)[i] - if splitter in x else x) - except: - ### Check if the column is a string column. If not, give an error message. - print('Cannot split the column. Getting an error. Check the column again') - return df - return df, new_names_list -########################################################################### -def add_aggregate_primitive_features(dft, agg_types, id_column, ignore_variables=[]): - """ - ### Modify Dataframe by adding computational primitive Features using Feature Tools #### - ### What are aggregate primitives? they are to "mean""median","mode","min","max", etc. features - ### Inputs: - ### df: Just sent in the data frame df that you want features added to - ### agg_types: list of computational types: 'mean','median','count', 'max', 'min', 'sum', etc. - ### One caveat: these agg_types must be found in the agg_func of numpy or pandas groupby statement. - ### for example: numpy has 'median','prod','sum','std','var', etc. - they will work! - ### idcolumn: this is to create an index for the dataframe since FT runs on index variable. You can leave it empty string. - ### ignore_variables: list of variables to ignore among numeric variables in data since they may be ID variables. - """ - import copy - ### Make sure the list of functions they send in are acceptable functions. If not, the aggregate will blow up! - func_set = {'count','sum','mean','mad','median','min','max','mode','abs','prod','std','var','sem','skew','kurt','quantile','cumsum','cumprod','cummax','cummin'} - agg_types = list(set(agg_types).intersection(func_set)) - ### If the ignore_variables list is empty, make sure you add the id_column to it so it can be dropped from aggregation. - if len(ignore_variables) == 0: - ignore_variables = [id_column] - ### Select only integer and float variables to do this aggregation on. 
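#### Hypothetical call of split_one_field_into_many() defined above; the 'items'
#### column and its contents are invented for illustration.
import pandas as pd
from featurewiz.featurewiz import split_one_field_into_many

orders = pd.DataFrame({'items': ['shoes;socks;belt', 'hat;scarf', 'gloves']})
orders, new_cols = split_one_field_into_many(orders, 'items', splitter=';', filler='missing',
                                             new_names_list=['item_1', 'item_2', 'item_3'],
                                             add_count_field=True)
# item_1..item_3 hold the split values; count_things_in_items holds the per-row item count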
Be very careful if there are too many vars. - ### This will take time to run in that case. - dft_index = copy.deepcopy(dft[id_column]) - dft_cont = copy.deepcopy(dft.select_dtypes('number').drop(ignore_variables,axis=1)) - dft_cont[id_column] = dft_index - try: - dft_full = dft_cont.groupby(id_column).agg(agg_types) - except: - ### if for some reason, the groupby blows up, then just return the dataframe as is - no changes! - return dft - cols = [x+'_'+y+'_by_'+id_column for (x,y) in dft_full.columns] - dft_full.columns = cols - ### Not every column has useful values. If it is full of just the same value, remove it - _, list_unique_col_ids = np.unique(dft_full, axis = 1, return_index=True) - dft_full = dft_full.iloc[:, list_unique_col_ids] - return dft_full -################################################################################################################################ -import copy -############################################################## -def create_ts_features(df, tscol): - """ - This takes in input a dataframe and a date variable. - It then creates time series features using the pandas .dt.weekday kind of syntax. - It also returns the data frame of added features with each variable as an integer variable. - """ - df = copy.deepcopy(df) - dt_adds = [] - try: - df[tscol+'_hour'] = df[tscol].dt.hour.astype(int) - df[tscol+'_minute'] = df[tscol].dt.minute.astype(int) - dt_adds.append(tscol+'_hour') - dt_adds.append(tscol+'_minute') - except: - print(' Error in creating hour-second derived features. Continuing...') - try: - df[tscol+'_dayofweek'] = df[tscol].dt.dayofweek.astype(int) - dt_adds.append(tscol+'_dayofweek') - df[tscol+'_quarter'] = df[tscol].dt.quarter.astype(int) - dt_adds.append(tscol+'_quarter') - df[tscol+'_month'] = df[tscol].dt.month.astype(int) - dt_adds.append(tscol+'_month') - df[tscol+'_year'] = df[tscol].dt.year.astype(int) - dt_adds.append(tscol+'_year') - today = date.today() - df[tscol+'_age_in_years'] = today.year - df[tscol].dt.year.astype(int) - dt_adds.append(tscol+'_age_in_years') - df[tscol+'_dayofyear'] = df[tscol].dt.dayofyear.astype(int) - dt_adds.append(tscol+'_dayofyear') - df[tscol+'_dayofmonth'] = df[tscol].dt.day.astype(int) - dt_adds.append(tscol+'_dayofmonth') - df[tscol+'_weekofyear'] = df[tscol].dt.weekofyear.astype(int) - dt_adds.append(tscol+'_weekofyear') - weekends = (df[tscol+'_dayofweek'] == 5) | (df[tscol+'_dayofweek'] == 6) - df[tscol+'_weekend'] = 0 - df.loc[weekends, tscol+'_weekend'] = 1 - df[tscol+'_weekend'] = df[tscol+'_weekend'].astype(int) - dt_adds.append(tscol+'_weekend') - except: - print(' Error in creating date time derived features. Continuing...') - df = df[dt_adds].fillna(0).astype(int) - return df -################################################################ -from dateutil.relativedelta import relativedelta -from datetime import date -##### This is a little utility that computes age from year #### -def compute_age(year_string): - today = date.today() - age = relativedelta(today, year_string) - return age.years -################################################################# -def create_time_series_features(dtf, ts_column): - """ - This creates between 8 and 10 date time features for each date variable. The number of features - depends on whether it is just a year variable or a year+month+day and whether it has hours and mins+secs. - So this can create all these features using just the date time column that you send in. - It returns the entire dataframe with added variables as output. 
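#### Hypothetical call of add_aggregate_primitive_features() defined above; the customer
#### ids, amounts and choice of aggregates are invented (a numeric id column is used so
#### that the default ignore_variables handling in the function applies cleanly).
import pandas as pd
from featurewiz.featurewiz import add_aggregate_primitive_features

txns = pd.DataFrame({'cust_id':  [101, 101, 102, 102, 102],
                     'amount':   [10.0, 30.0, 5.0, 5.0, 20.0],
                     'quantity': [1, 3, 1, 1, 2]})
aggs = add_aggregate_primitive_features(txns, ['mean', 'max', 'sum'], 'cust_id')
# one row per cust_id, with columns such as amount_mean_by_cust_id and quantity_max_by_cust_id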
- """ - dtf = copy.deepcopy(dtf) - #### If for some reason ts_column is just a number, make sure it is a string so it does not blow up and concatenated - if not isinstance(ts_column,str): - ts_column = str(ts_column) - try: - ### In some extreme cases, date time vars are not processed yet and hence we must fill missing values here! - if dtf[ts_column].isnull().sum() > 0: - missing_flag = True - new_missing_col = ts_column + '_Missing_Flag' - dtf[new_missing_col] = 0 - dtf.loc[dtf[ts_column].isnull(),new_missing_col]=1 - dtf[ts_column] = dtf[ts_column].fillna(method='ffill') - if dtf[ts_column].dtype in [np.float64,np.float32,np.float16]: - dtf[ts_column] = dtf[ts_column].astype(int) - ### if we have already found that it was a date time var, then leave it as it is. Thats good enough! - date_items = dtf[ts_column].apply(str).apply(len).values - #### In some extreme cases, - if all(date_items[0] == item for item in date_items): - if date_items[0] == 4: - ### If it is just a year variable alone, you should leave it as just a year! - age_col = ts_column+'_age_in_years' - dtf[age_col] = dtf[ts_column].map(lambda x: pd.to_datetime(x,format='%Y')).apply(compute_age).values - return dtf[[ts_column,age_col]] - else: - ### if it is not a year alone, then convert it into a date time variable - dtf[ts_column] = pd.to_datetime(dtf[ts_column], infer_datetime_format=True) - else: - dtf[ts_column] = pd.to_datetime(dtf[ts_column], infer_datetime_format=True) - dtf = create_ts_features(dtf,ts_column) - except: - print('Error in Processing %s column for date time features. Continuing...' %ts_column) - return dtf -###################################################################################### +############################################################################## +#Copyright 2019 Google LLC +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. +################################################################################# +#### C O D E C A N B E RE-FACTORED W I T H C I T A T I O N B E L O W ### +################################################################################# +############### F E A T U R E W I Z A R D ################## +################ featurewiz library developed by Ram Seshadri ################# +# featurewiz utilizes SULOV METHOD which is a fast method for feature selection # +##### SULOV also means Searching for Uncorrelated List Of Variables (:-) ###### +############### v 0.0.7 ################ +############### A L L R I G H T S R E S E R V E D ################ +################################################################################# +##### This project is not an official Google project. It is not supported by #### +##### Google and Google specifically disclaims all warranties as to its quality,# +##### merchantability, or fitness for a particular purpose. 
#################### +################################################################################# +import pandas as pd +import numpy as np +from sklearn.model_selection import KFold +from sklearn.model_selection import GridSearchCV +from sklearn.multioutput import MultiOutputClassifier, MultiOutputRegressor +from sklearn.multiclass import OneVsRestClassifier +import xgboost as xgb +from xgboost.sklearn import XGBClassifier +from xgboost.sklearn import XGBRegressor +from sklearn.model_selection import train_test_split +################################################################################ +#### The warnings from Sklearn are so annoying that I have to shut it off ####### +import warnings +warnings.filterwarnings("ignore") +from sklearn.exceptions import DataConversionWarning +warnings.filterwarnings(action='ignore', category=DataConversionWarning) +def warn(*args, **kwargs): + pass +warnings.warn = warn +#################################################################################### +import re +import pdb +import pprint +from itertools import cycle, combinations +from collections import defaultdict, OrderedDict +import copy +import time +import sys +import random +import xlrd +import statsmodels +from io import BytesIO +import base64 +from functools import reduce +import copy +####################################################################################################### +def classify_features(dfte, depVar, verbose=0): + dfte = copy.deepcopy(dfte) + if isinstance(depVar, list): + orig_preds = [x for x in list(dfte) if x not in depVar] + else: + orig_preds = [x for x in list(dfte) if x not in [depVar]] + ################# CLASSIFY COLUMNS HERE ###################### + var_df = classify_columns(dfte[orig_preds], verbose) + ##### Classify Columns ################ + IDcols = var_df['id_vars'] + discrete_string_vars = var_df['nlp_vars']+var_df['discrete_string_vars'] + cols_delete = var_df['cols_delete'] + bool_vars = var_df['string_bool_vars'] + var_df['num_bool_vars'] + int_vars = var_df['int_vars'] + categorical_vars = var_df['cat_vars'] + var_df['factor_vars'] + int_vars + bool_vars + date_vars = var_df['date_vars'] + if len(var_df['continuous_vars'])==0 and len(int_vars)>0: + continuous_vars = var_df['int_vars'] + categorical_vars = left_subtract(categorical_vars, int_vars) + int_vars = [] + else: + continuous_vars = var_df['continuous_vars'] + preds = [x for x in orig_preds if x not in IDcols+cols_delete+discrete_string_vars] + if len(IDcols+cols_delete+discrete_string_vars) == 0: + print(' No variables removed since no ID or low-information variables found in data set') + else: + print(' %d variables removed since they were ID or low-information variables' + %len(IDcols+cols_delete+discrete_string_vars)) + if verbose >= 1: + print(' List of variables removed: %s' %(IDcols+cols_delete+discrete_string_vars)) + ############# Check if there are too many columns to visualize ################ + ppt = pprint.PrettyPrinter(indent=4) + if verbose==1 and len(cols_list) <= max_cols_analyzed: + marthas_columns(dft,verbose) + print(" Columns to delete:") + ppt.pprint(' %s' % cols_delete) + print(" Boolean variables %s ") + ppt.pprint(' %s' % bool_vars) + print(" Categorical variables %s ") + ppt.pprint(' %s' % categorical_vars) + print(" Continuous variables %s " ) + ppt.pprint(' %s' % continuous_vars) + print(" Discrete string variables %s " ) + ppt.pprint(' %s' % discrete_string_vars) + print(" Date and time variables %s " ) + ppt.pprint(' %s' % date_vars) + print(" ID 
variables %s ") + ppt.pprint(' %s' % IDcols) + print(" Target variable %s ") + ppt.pprint(' %s' % depVar) + elif verbose==1 and len(cols_list) > max_cols_analyzed: + print(' Total columns > %d, too numerous to list.' %max_cols_analyzed) + features_dict = dict([('IDcols',IDcols),('cols_delete',cols_delete),('bool_vars',bool_vars),('categorical_vars',categorical_vars), + ('continuous_vars',continuous_vars),('discrete_string_vars',discrete_string_vars), + ('date_vars',date_vars)]) + return features_dict +####################################################################################################### +def marthas_columns(data,verbose=0): + """ + This program is named in honor of my one of students who came up with the idea for it. + It's a neat way of printing data types and information compared to the boring describe() function in Pandas. + """ + data = data[:] + print('Data Set Shape: %d rows, %d cols' % data.shape) + if data.shape[1] > 30: + print('Too many columns to print') + else: + if verbose==1: + print('Data Set columns info:') + for col in data.columns: + print('* %s: %d nulls, %d unique vals, most common: %s' % ( + col, + data[col].isnull().sum(), + data[col].nunique(), + data[col].value_counts().head(2).to_dict() + )) + print('--------------------------------------------------------------------') +################################################################################ +######### NEW And FAST WAY to CLASSIFY COLUMNS IN A DATA SET ####### +################################################################################ +def classify_columns(df_preds, verbose=0): + """ + Takes a dataframe containing only predictors to be classified into various types. + DO NOT SEND IN A TARGET COLUMN since it will try to include that into various columns. + Returns a data frame containing columns and the class it belongs to such as numeric, + categorical, date or id column, boolean, nlp, discrete_string and cols to delete... 
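+    An illustrative sketch of the output (column names invented for the example): for a frame
+    with a float 'price', a low-cardinality string 'city', a unique string 'user_id' and a
+    datetime64 'signup_date' column, the dictionary described below might map 'continuous_vars'
+    to ['price'], 'cat_vars' to ['city'], 'id_vars' to ['user_id'] and 'date_vars' to
+    ['signup_date'], with the remaining keys holding empty lists.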
+ ####### Returns a dictionary with 10 kinds of vars like the following: # continuous_vars,int_vars + # cat_vars,factor_vars, bool_vars,discrete_string_vars,nlp_vars,date_vars,id_vars,cols_delete + """ + train = copy.deepcopy(df_preds) + #### If there are 30 chars are more in a discrete_string_var, it is then considered an NLP variable + max_nlp_char_size = 30 + max_cols_to_print = 30 + print('############## C L A S S I F Y I N G V A R I A B L E S ####################') + print('Classifying variables in data set...') + #### Cat_Limit defines the max number of categories a column can have to be called a categorical colum + cat_limit = 35 + float_limit = 15 #### Make this limit low so that float variables below this limit become cat vars ### + def add(a,b): + return a+b + sum_all_cols = dict() + orig_cols_total = train.shape[1] + #Types of columns + cols_delete = [col for col in list(train) if (len(train[col].value_counts()) == 1 + ) | (train[col].isnull().sum()/len(train) >= 0.90)] + train = train[left_subtract(list(train),cols_delete)] + var_df = pd.Series(dict(train.dtypes)).reset_index(drop=False).rename( + columns={0:'type_of_column'}) + sum_all_cols['cols_delete'] = cols_delete + var_df['bool'] = var_df.apply(lambda x: 1 if x['type_of_column'] in ['bool','object'] + and len(train[x['index']].value_counts()) == 2 else 0, axis=1) + string_bool_vars = list(var_df[(var_df['bool'] ==1)]['index']) + sum_all_cols['string_bool_vars'] = string_bool_vars + var_df['num_bool'] = var_df.apply(lambda x: 1 if x['type_of_column'] in [np.uint8, + np.uint16, np.uint32, np.uint64, + 'int8','int16','int32','int64', + 'float16','float32','float64'] and len( + train[x['index']].value_counts()) == 2 else 0, axis=1) + num_bool_vars = list(var_df[(var_df['num_bool'] ==1)]['index']) + sum_all_cols['num_bool_vars'] = num_bool_vars + ###### This is where we take all Object vars and split them into diff kinds ### + discrete_or_nlp = var_df.apply(lambda x: 1 if x['type_of_column'] in ['object'] and x[ + 'index'] not in string_bool_vars+cols_delete else 0,axis=1) + ######### This is where we figure out whether a string var is nlp or discrete_string var ### + var_df['nlp_strings'] = 0 + var_df['discrete_strings'] = 0 + var_df['cat'] = 0 + var_df['id_col'] = 0 + discrete_or_nlp_vars = var_df.loc[discrete_or_nlp==1]['index'].values.tolist() + if len(var_df.loc[discrete_or_nlp==1]) != 0: + for col in discrete_or_nlp_vars: + #### first fill empty or missing vals since it will blowup ### + train[col] = train[col].fillna(' ') + if train[col].map(lambda x: len(x) if type(x)==str else 0).mean( + ) >= max_nlp_char_size and len(train[col].value_counts() + ) <= int(0.9*len(train)) and col not in string_bool_vars: + var_df.loc[var_df['index']==col,'nlp_strings'] = 1 + elif len(train[col].value_counts()) > cat_limit and len(train[col].value_counts() + ) <= int(0.9*len(train)) and col not in string_bool_vars: + var_df.loc[var_df['index']==col,'discrete_strings'] = 1 + elif len(train[col].value_counts()) > cat_limit and len(train[col].value_counts() + ) == len(train) and col not in string_bool_vars: + var_df.loc[var_df['index']==col,'id_col'] = 1 + else: + var_df.loc[var_df['index']==col,'cat'] = 1 + nlp_vars = list(var_df[(var_df['nlp_strings'] ==1)]['index']) + sum_all_cols['nlp_vars'] = nlp_vars + discrete_string_vars = list(var_df[(var_df['discrete_strings'] ==1) ]['index']) + sum_all_cols['discrete_string_vars'] = discrete_string_vars + ###### This happens only if a string column happens to be an ID column ####### + #### DO 
NOT Add this to ID_VARS yet. It will be done later.. Dont change it easily... + #### Category DTYPE vars are very special = they can be left as is and not disturbed in Python. ### + var_df['dcat'] = var_df.apply(lambda x: 1 if str(x['type_of_column'])=='category' else 0, + axis=1) + factor_vars = list(var_df[(var_df['dcat'] ==1)]['index']) + sum_all_cols['factor_vars'] = factor_vars + ######################################################################## + date_or_id = var_df.apply(lambda x: 1 if x['type_of_column'] in [np.uint8, + np.uint16, np.uint32, np.uint64, + 'int8','int16', + 'int32','int64'] and x[ + 'index'] not in string_bool_vars+num_bool_vars+discrete_string_vars+nlp_vars else 0, + axis=1) + ######### This is where we figure out whether a numeric col is date or id variable ### + var_df['int'] = 0 + var_df['date_time'] = 0 + ### if a particular column is date-time type, now set it as a date time variable ## + var_df['date_time'] = var_df.apply(lambda x: 1 if x['type_of_column'] in [' 2050: + var_df.loc[var_df['index']==col,'id_col'] = 1 + else: + try: + pd.to_datetime(train[col],infer_datetime_format=True) + var_df.loc[var_df['index']==col,'date_time'] = 1 + except: + var_df.loc[var_df['index']==col,'id_col'] = 1 + else: + if train[col].min() < 1900 or train[col].max() > 2050: + if col not in num_bool_vars: + var_df.loc[var_df['index']==col,'int'] = 1 + else: + try: + pd.to_datetime(train[col],infer_datetime_format=True) + var_df.loc[var_df['index']==col,'date_time'] = 1 + except: + if col not in num_bool_vars: + var_df.loc[var_df['index']==col,'int'] = 1 + else: + pass + int_vars = list(var_df[(var_df['int'] ==1)]['index']) + date_vars = list(var_df[(var_df['date_time'] == 1)]['index']) + id_vars = list(var_df[(var_df['id_col'] == 1)]['index']) + sum_all_cols['int_vars'] = int_vars + copy_date_vars = copy.deepcopy(date_vars) + for date_var in copy_date_vars: + #### This test is to make sure sure date vars are actually date vars + try: + pd.to_datetime(train[date_var],infer_datetime_format=True) + except: + ##### if not a date var, then just add it to delete it from processing + cols_delete.append(date_var) + date_vars.remove(date_var) + sum_all_cols['date_vars'] = date_vars + sum_all_cols['id_vars'] = id_vars + sum_all_cols['cols_delete'] = cols_delete + ## This is an EXTREMELY complicated logic for cat vars. Don't change it unless you test it many times! 
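+    ## In brief: the block below first flags float columns with more than 2 but at most
+    ## float_limit distinct values as categorical and the remaining floats as continuous;
+    ## the clean-up pass that follows then returns float-dtype "cat" columns to
+    ## continuous_vars and moves any column whose distinct count equals the row count
+    ## into id_vars.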
+ var_df['numeric'] = 0 + float_or_cat = var_df.apply(lambda x: 1 if x['type_of_column'] in ['float16', + 'float32','float64'] else 0, + axis=1) + if len(var_df.loc[float_or_cat == 1]) > 0: + for col in var_df.loc[float_or_cat == 1]['index'].values.tolist(): + if len(train[col].value_counts()) > 2 and len(train[col].value_counts() + ) <= float_limit and len(train[col].value_counts()) <= len(train): + var_df.loc[var_df['index']==col,'cat'] = 1 + else: + if col not in num_bool_vars: + var_df.loc[var_df['index']==col,'numeric'] = 1 + cat_vars = list(var_df[(var_df['cat'] ==1)]['index']) + continuous_vars = list(var_df[(var_df['numeric'] ==1)]['index']) + ######## V E R Y I M P O R T A N T ################################################### + ##### There are a couple of extra tests you need to do to remove abberations in cat_vars ### + cat_vars_copy = copy.deepcopy(cat_vars) + for cat in cat_vars_copy: + if df_preds[cat].dtype==float: + continuous_vars.append(cat) + cat_vars.remove(cat) + var_df.loc[var_df['index']==cat,'cat'] = 0 + var_df.loc[var_df['index']==cat,'numeric'] = 1 + elif len(df_preds[cat].value_counts()) == df_preds.shape[0]: + id_vars.append(cat) + cat_vars.remove(cat) + var_df.loc[var_df['index']==cat,'cat'] = 0 + var_df.loc[var_df['index']==cat,'id_col'] = 1 + sum_all_cols['cat_vars'] = cat_vars + sum_all_cols['continuous_vars'] = continuous_vars + sum_all_cols['id_vars'] = id_vars + ###### This is where you consoldate the numbers ########### + var_dict_sum = dict(zip(var_df.values[:,0], var_df.values[:,2:].sum(1))) + for col, sumval in var_dict_sum.items(): + if sumval == 0: + print('%s of type=%s is not classified' %(col,train[col].dtype)) + elif sumval > 1: + print('%s of type=%s is classified into more then one type' %(col,train[col].dtype)) + else: + pass + ############### This is where you print all the types of variables ############## + ####### Returns 8 vars in the following order: continuous_vars,int_vars,cat_vars, + ### string_bool_vars,discrete_string_vars,nlp_vars,date_or_id_vars,cols_delete + if verbose == 1: + print(" Number of Numeric Columns = ", len(continuous_vars)) + print(" Number of Integer-Categorical Columns = ", len(int_vars)) + print(" Number of String-Categorical Columns = ", len(cat_vars)) + print(" Number of Factor-Categorical Columns = ", len(factor_vars)) + print(" Number of String-Boolean Columns = ", len(string_bool_vars)) + print(" Number of Numeric-Boolean Columns = ", len(num_bool_vars)) + print(" Number of Discrete String Columns = ", len(discrete_string_vars)) + print(" Number of NLP String Columns = ", len(nlp_vars)) + print(" Number of Date Time Columns = ", len(date_vars)) + print(" Number of ID Columns = ", len(id_vars)) + print(" Number of Columns to Delete = ", len(cols_delete)) + if verbose == 2: + marthas_columns(df_preds,verbose=1) + print(" Numeric Columns: %s" %continuous_vars[:max_cols_to_print]) + print(" Integer-Categorical Columns: %s" %int_vars[:max_cols_to_print]) + print(" String-Categorical Columns: %s" %cat_vars[:max_cols_to_print]) + print(" Factor-Categorical Columns: %s" %factor_vars[:max_cols_to_print]) + print(" String-Boolean Columns: %s" %string_bool_vars[:max_cols_to_print]) + print(" Numeric-Boolean Columns: %s" %num_bool_vars[:max_cols_to_print]) + print(" Discrete String Columns: %s" %discrete_string_vars[:max_cols_to_print]) + print(" NLP text Columns: %s" %nlp_vars[:max_cols_to_print]) + print(" Date Time Columns: %s" %date_vars[:max_cols_to_print]) + print(" ID Columns: %s" %id_vars[:max_cols_to_print]) + 
print(" Columns that will not be considered in modeling: %s" %cols_delete[:max_cols_to_print]) + ##### now collect all the column types and column names into a single dictionary to return! + len_sum_all_cols = reduce(add,[len(v) for v in sum_all_cols.values()]) + if len_sum_all_cols == orig_cols_total: + print(' %d Predictors classified...' %orig_cols_total) + print(' This does not include the Target column(s)') + else: + print('No of columns classified %d does not match %d total cols. Continuing...' %( + len_sum_all_cols, orig_cols_total)) + ls = sum_all_cols.values() + flat_list = [item for sublist in ls for item in sublist] + if len(left_subtract(list(train),flat_list)) == 0: + print(' Missing columns = None') + else: + print(' Missing columns = %s' %left_subtract(list(train),flat_list)) + return sum_all_cols +################################################################################# +from collections import Counter +import time +from sklearn.feature_selection import chi2, mutual_info_regression, mutual_info_classif +from sklearn.feature_selection import SelectKBest +################################################################################## +def load_file_dataframe(dataname, sep=",", header=0, verbose=0): + start_time = time.time() + ########################### This is where we load file or data frame ############### + if isinstance(dataname,str): + #### this means they have given file name as a string to load the file ##### + if dataname != '' and dataname.endswith(('csv')): + codex = ['utf-8', 'iso-8859-1', 'cp1252', 'latin1'] + for code in codex: + try: + dfte = pd.read_csv(dataname,sep=sep,index_col=None,encoding=code) + print('Encoder %s chosen to read CSV file' %code) + print('Shape of your Data Set loaded: %s' %(dfte.shape,)) + return dfte + except: + print('Encoding codex %s does not work for this file' %code) + continue + elif dataname.endswith(('xlsx','xls','txt')): + #### It's very important to get header rows in Excel since people put headers anywhere in Excel# + dfte = pd.read_excel(dataname,header=header) + print('Shape of your Data Set loaded: %s' %(dfte.shape,)) + return dfte + else: + print('File not able to be loaded') + return + if isinstance(dataname,pd.DataFrame): + #### this means they have given a dataframe name to use directly in processing ##### + dfte = copy.deepcopy(dataname) + return dfte + else: + print('Dataname input must be a filename with path to that file or a Dataframe') + return +################################################################################## +# Removes duplicates from a list to return unique values - USED ONLYONCE +def find_remove_duplicates(values): + output = [] + seen = set() + for value in values: + if value not in seen: + output.append(value) + seen.add(value) + return output +################################################################################# +#### Regression or Classification type problem +def analyze_problem_type(train, target, verbose=0) : + target = copy.deepcopy(target) + cat_limit = 30 ### this determines the number of categories to name integers as classification ## + float_limit = 15 ### this limits the number of float variable categories for it to become cat var + if isinstance(target, str): + target = [target] + if len(target) == 1: + targ = target[0] + model_label = 'Single_Label' + else: + targ = target[0] + model_label = 'Multi_Label' + #### This is where you detect what kind of problem it is ################# + if train[targ].dtype in ['int64', 'int32','int16']: + if 
len(train[targ].unique()) <= 2: + model_class = 'Binary_Classification' + elif len(train[targ].unique()) > 2 and len(train[targ].unique()) <= cat_limit: + model_class = 'Multi_Classification' + else: + model_class = 'Regression' + elif train[targ].dtype in ['float']: + if len(train[targ].unique()) <= 2: + model_class = 'Binary_Classification' + elif len(train[targ].unique()) > 2 and len(train[targ].unique()) <= float_limit: + model_class = 'Multi_Classification' + else: + model_class = 'Regression' + else: + if len(train[targ].unique()) <= 2: + model_class = 'Binary_Classification' + else: + model_class = 'Multi_Classification' + ########### print this for the start of next step ########### + if verbose <= 1: + print('''################ %s %s Feature Selection Started #####################''' %( + model_label,model_class)) + return model_class +##################################################################################### +from collections import defaultdict +from collections import OrderedDict +import time +def return_dictionary_list(lst_of_tuples): + """ Returns a dictionary of lists if you send in a list of Tuples""" + orDict = defaultdict(list) + # iterating over list of tuples + for key, val in lst_of_tuples: + orDict[key].append(val) + return orDict +################################################################################## +def remove_variables_using_fast_correlation(df, numvars, modeltype, target, + corr_limit = 0.70,verbose=0): + """ + ########################################################################################## + ##### SULOV stands for Searching Uncorrelated List Of Variables ############ + This highly efficient method removes variables that are highly correlated using a series of + pair-wise correlation knockout rounds. It is extremely fast and hence can work on thousands + of variables in less than a minute, even on a laptop. You need to send in a list of numeric + variables and that's all! The method defines high Correlation as anything over 0.70 (absolute) + but this can be changed. If two variables have absolute correlation higher than this, they + will be marked, and using a process of elimination, one of them will get knocked out: + To decide order of variables to keep, we use mutuail information score to select. MIS returns + a ranked list of these correlated variables: when we select one, we knock out others + that it is correlated to. Then we select next var. This way we knock out correlated variables. + Finally we are left with uncorrelated variables that are also highly important in mutual score. + ############## YOU MUST INCLUDE THE ABOVE MESSAGE IF YOU COPY THIS CODE IN YOUR LIBRARY ##### + """ + import copy + target = copy.deepcopy(target) + print('Searching for highly correlated variables from %d variables using SULOV method' %len(numvars)) + print('##### SULOV : Searching for Uncorrelated List Of Variables (takes time...) 
############') + correlation_dataframe = df[numvars].corr().abs().astype(np.float16) + ######### This is how you create a dictionary of which var is highly correlated to a list of vars #### + corr_values = correlation_dataframe.values + col_index = correlation_dataframe.columns.tolist() + index_triupper = list(zip(np.triu_indices_from(corr_values,k=1)[0],np.triu_indices_from( + corr_values,k=1)[1])) + high_corr_index_list = [x for x in np.argwhere(abs(corr_values[np.triu_indices(len(corr_values), k = 1)])>=corr_limit)] + low_corr_index_list = [x for x in np.argwhere(abs(corr_values[np.triu_indices(len(corr_values), k = 1)]) 1: + corr_pair_dict[key] += val + else: + corr_pair_dict[key] = val + #### corr_pair_dict is used later to make the network diagram to see which vars are correlated to which + # Selecting upper triangle of correlation matrix ## this is a fast way to find highly correlated vars + upper_tri = correlation_dataframe.where(np.triu(np.ones(correlation_dataframe.shape), + k=1).astype(np.bool)) + empty_df = upper_tri[abs(upper_tri)>corr_limit] + ### if none of the variables are highly correlated, you can skip this whole drawing + if empty_df.isnull().all().all(): + print(' No highly correlated variables in data set to remove. All selected...') + return numvars + #### It's important to find the highly correlated features first ############# + lower_tri = correlation_dataframe.where(np.tril(np.ones(correlation_dataframe.shape), + k=-1).astype(np.bool)) + lower_df = lower_tri[abs(lower_tri)>corr_limit] + corr_list = empty_df.columns[[not(empty_df[x].isnull().all()) for x in list(empty_df)]].tolist( + )+lower_df.columns[[not(lower_df[x].isnull().all()) for x in list(lower_df)]].tolist() + corr_list = find_remove_duplicates(corr_list) + ###### This is for ordering the variables in the highest to lowest importance to target ### + if len(corr_list) == 0: + final_list = list(correlation_dataframe) + print('Selecting all (%d) variables since none of them are highly correlated...' 
%len(numvars)) + return numvars + else: + if isinstance(target, list): + target = target[0] + max_feats = len(corr_list) + if modeltype == 'Regression': + sel_function = mutual_info_regression + fs = SelectKBest(score_func=sel_function, k=max_feats) + else: + sel_function = mutual_info_classif + fs = SelectKBest(score_func=sel_function, k=max_feats) + try: + fs.fit(df[corr_list].astype(np.float16), df[target]) + mutual_info = dict(zip(corr_list,fs.scores_)) + #### The first variable in list has the highest correlation to the target variable ### + sorted_by_mutual_info =[key for (key,val) in sorted(mutual_info.items(), key=lambda kv: kv[1],reverse=True)] + ##### Now we select the final list of correlated variables ########### + selected_corr_list = [] + #### You have to make multiple copies of this sorted list since it is iterated many times #### + orig_sorted = copy.deepcopy(sorted_by_mutual_info) + copy_sorted = copy.deepcopy(sorted_by_mutual_info) + copy_pair = copy.deepcopy(corr_pair_dict) + #### select each variable by the highest mutual info and see what vars are correlated to it + for each_corr_name in copy_sorted: + ### add the selected var to the selected_corr_list + selected_corr_list.append(each_corr_name) + for each_remove in copy_pair[each_corr_name]: + #### Now remove each variable that is highly correlated to the selected variable + if each_remove in copy_sorted: + copy_sorted.remove(each_remove) + ##### Now we combine the uncorrelated list to the selected correlated list above + rem_col_list = left_subtract(list(correlation_dataframe),corr_list) + final_list = rem_col_list + selected_corr_list + removed_cols = left_subtract(numvars, final_list) + except: + print(' SULOV Method crashing due to memory error, trying alternative simpler method...') + #### Dropping highly correlated Features fast using simple linear correlation ### + removed_cols = remove_highly_correlated_vars_fast(train[numvars],corr_limit) + final_list = left_subtract(numvars, removed_cols) + if len(removed_cols) > 0: + print(' Removing (%d) highly correlated variables:' %(len(removed_cols))) + if len(removed_cols) <= 30: + print(' %s' %removed_cols) + if len(final_list) <= 30: + print(' Following (%d) vars selected: %s' %(len(final_list),final_list)) + ############## D R A W C O R R E L A T I O N N E T W O R K ################## + selected = copy.deepcopy(final_list) + try: + import networkx as nx + except: + print(' Python networkx library not installed. 
Install it for feature selection visualization.') + return + #### Now start building the graph ################### + gf = nx.Graph() + ### the mutual info score gives the size of the bubble ### + multiplier = 2100 + for each in orig_sorted: + gf.add_node(each, size=int(max(1,mutual_info[each]*multiplier))) + ######### This is where you calculate the size of each node to draw + sizes = [mutual_info[x]*multiplier for x in list(gf.nodes())] + #### The sizes of the bubbles for each node is determined by its mutual information score value + corr = df[corr_list].corr() + high_corr = corr[abs(corr)>corr_limit] + ## high_corr is the dataframe of a few variables that are highly correlated to each other + combos = combinations(corr_list,2) + ### this gives the strength of correlation between 2 nodes ## + multiplier = 20 + for (var1, var2) in combos: + if np.isnan(high_corr.loc[var1,var2]): + pass + else: + gf.add_edge(var1, var2,weight=multiplier*high_corr.loc[var1,var2]) + ######## Now start building the networkx graph ########################## + import copy + widths = nx.get_edge_attributes(gf, 'weight') + nodelist = gf.nodes() + cols = 5 + height_size = 5 + width_size = 15 + rows = int(len(corr_list)/cols) + if rows < 1: + rows = 1 + plt.figure(figsize=(width_size,min(20,height_size*rows))) + pos = nx.shell_layout(gf) + nx.draw_networkx_nodes(gf,pos, + nodelist=nodelist, + node_size=sizes, + node_color='blue', + alpha=0.5) + nx.draw_networkx_edges(gf,pos, + edgelist = widths.keys(), + width=list(widths.values()), + edge_color='lightblue', + alpha=0.6) + pos_higher = {} + x_off = 0.04 # offset on the x axis + y_off = 0.04 # offset on the y axis + for k, v in pos.items(): + pos_higher[k] = (v[0]+x_off, v[1]+y_off) + if len(selected) == 0: + nx.draw_networkx_labels(gf, pos=pos_higher, + labels=dict(zip(nodelist,nodelist)), + font_color='black') + else: + nx.draw_networkx_labels(gf, pos=pos_higher, + labels = dict(zip(nodelist,[x+' (selected)' if x in selected else x+' (removed)' for x in nodelist])), + font_color='black') + plt.box(True) + plt.title("""In SULOV, we repeatedly remove features with lower mutual info scores among highly correlated pairs (see figure), + SULOV selects the feature with higher mutual info score related to target when choosing between a pair. """, fontsize=10) + plt.suptitle('How SULOV Method of Removing Highly Correlated Features in a Data Set works', fontsize=20,y=1.03) + red_patch = mpatches.Patch(color='blue', label='Bigger size of circle denotes higher mutual info score with target') + blue_patch = mpatches.Patch(color='lightblue', label='Thicker line width denotes higher correlation between two variables') + plt.legend(handles=[red_patch, blue_patch],loc='best') + plt.show(); + ##### N E T W O R K D I A G R A M C O M P L E T E ################# + return final_list +############################################################################################### +def count_freq_in_list(lst): + """ + This counts the frequency of items in a list but MAINTAINS the order of appearance of items. + This order is very important when you are doing certain functions. Hence this function! 
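+    Illustrative example: count_freq_in_list(['b', 'a', 'b']) returns [('a', 1), ('b', 2)];
+    since np.unique sorts its input, the pairs come back ordered by value.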
+ """ + temp=np.unique(lst) + result = [] + for i in temp: + result.append((i,lst.count(i))) + return result +############################################################################################### +def left_subtract(l1,l2): + lst = [] + for i in l1: + if i not in l2: + lst.append(i) + return lst +################################################################################# +def convert_train_test_cat_col_to_numeric(start_train, start_test, col): + """ + #### This is the easiest way to label encode object variables in both train and test + #### This takes care of some categories that are present in train and not in test + ### and vice versa + """ + start_train = copy.deepcopy(start_train) + start_test = copy.deepcopy(start_test) + if start_train[col].isnull().sum() > 0: + start_train[col] = start_train[col].fillna("NA") + train_categs = list(pd.unique(start_train[col].values)) + if not isinstance(start_test,str) : + test_categs = list(pd.unique(start_test[col].values)) + categs_all = train_categs+test_categs + dict_all = return_factorized_dict(categs_all) + else: + dict_all = return_factorized_dict(train_categs) + start_train[col] = start_train[col].map(dict_all) + if not isinstance(start_test,str) : + if start_test[col].isnull().sum() > 0: + start_test[col] = start_test[col].fillna("NA") + start_test[col] = start_test[col].map(dict_all) + return start_train, start_test +############################################################################### +def return_factorized_dict(ls): + """ + ###### Factorize any list of values in a data frame using this neat function + if your data has any NaN's it automatically marks it as -1 and returns that for NaN's + Returns a dictionary mapping previous values with new values. + """ + factos = pd.unique(pd.factorize(ls)[0]) + categs = pd.unique(pd.factorize(ls)[1]) + if -1 in factos: + categs = np.insert(categs,np.where(factos==-1)[0][0],np.nan) + return dict(zip(categs,factos)) +########################################################################################### +############## CONVERSION OF STRING COLUMNS TO NUMERIC WITHOUT LABEL ENCODER ######### +####################################################################################### +import copy +import pdb +def convert_a_column_to_numeric(x, col_dict=""): + '''Function converts any pandas series (or column) consisting of string chars, + into numeric values. It converts an all-string column to an all-number column. + This is an amazing function which performs exactly like a Label Encoding + except that it is simpler and faster''' + if isinstance(col_dict, str): + values = np.unique(x) + values2nums = dict(zip(values,range(len(values)))) + convert_dict = dict(zip(range(len(values)),values)) + return x.replace(values2nums), convert_dict + else: + convert_dict = copy.deepcopy(col_dict) + keys = col_dict.keys() + newkeys = np.unique(x) + rem_keys = left_subtract(newkeys, keys) + max_val = max(col_dict.values()) + 1 + for eachkey in rem_keys: + convert_dict.update({eachkey:max_val}) + max_val += 1 + return x.replace(convert_dict) +####################################################################################### +def convert_a_mixed_object_column_to_numeric(x, col_dict=''): + """ + This is the main utility that converts any string column to numeric. + It does not need Label Encoder since it picks up an string that may not be in test data. 
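+    Illustrative example (values invented): passing pd.Series(['b', 'a', 'b']) returns the
+    encoded series [1, 0, 1] together with the mapping {'a': 0, 'b': 1}; passing that mapping
+    back in as col_dict applies the same codes to another (e.g. test) column, with unseen
+    categories receiving fresh codes.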
+ """ + x = x.astype(str) + if isinstance(col_dict, str): + x, convert_dict = convert_a_column_to_numeric(x) + convert_dict = dict([(v,k) for (k,v) in convert_dict.items()]) + return x, convert_dict + else: + x = convert_a_column_to_numeric(x, col_dict) + return x, '' +###################################################################################### +def convert_all_object_columns_to_numeric(train, test=""): + """ + ####################################################################################### + This is a utility that converts string columns to numeric WITHOUT LABEL ENCODER. + The beauty of this utility is that it does not blow up when it finds strings in test not in train. + ####################################################################################### + """ + train = copy.deepcopy(train) + lis = [] + lis = train.select_dtypes('object').columns.tolist() + train.select_dtypes('category').columns.tolist() + if not (len(lis)==0): + for everycol in lis: + #print(' Converting %s to numeric' %everycol) + try: + train[everycol], train_dict = convert_a_mixed_object_column_to_numeric(train[everycol]) + if not isinstance(test, str): + test[everycol],_ = convert_a_mixed_object_column_to_numeric(test[everycol], train_dict) + except: + print('Error converting %s column from string to numeric. Continuing...' %everycol) + continue + return train, test +################################################################################### +from sklearn.feature_selection import chi2, mutual_info_regression, mutual_info_classif +from sklearn.feature_selection import SelectKBest +def featurewiz(dataname, target, corr_limit=0.7, verbose=0, sep=",", header=0): + """ + This is a fast utility that uses XGB to find top features. You + It returns a list of important features. + Since it is XGB, you dont have to restrict the input to just numeric vars. + You can send in all kinds of vars and it will take care of transforming it. Sweet! + """ + train = load_file_dataframe(dataname, sep, header, verbose) + start_time = time.time() + #### If there are more than 30 categorical variables in a data set, it is worth reducing features. + #### Otherwise. XGBoost is pretty good at finding the best features whether cat or numeric ! 
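+    # Illustrative usage (file name and target column below are invented):
+    #     best_feats = featurewiz('train.csv', 'price', corr_limit=0.7, verbose=1)
+    # dataname can be a file path or a pandas DataFrame. The function classifies the columns,
+    # runs the SULOV correlation knockout on the numeric variables, then iterates XGBoost over
+    # the survivors and returns the selected feature names as a list.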
+ n_splits = 5 + max_depth = 8 + max_cats = 5 + ###################### I M P O R T A N T #################################### + subsample = 0.7 + col_sub_sample = 0.7 + test_size = 0.2 + seed = 1 + early_stopping = 5 + ####### All the default parameters are set up now ######### + kf = KFold(n_splits=n_splits, random_state=33) + ###### This is where we set the CPU and GPU parameters for XGBoost + GPU_exists = check_if_GPU_exists() + ##### Set the Scoring Parameters here based on each model and preferences of user ############## + cpu_params = {} + param = {} + cpu_params['nthread'] = -1 + cpu_params['tree_method'] = 'hist' + cpu_params['grow_policy'] = 'depthwise' + cpu_params['max_depth'] = max_depth + cpu_params['max_leaves'] = 0 + cpu_params['verbosity'] = 0 + cpu_params['gpu_id'] = 0 + cpu_params['updater'] = 'grow_colmaker' + cpu_params['predictor'] = 'cpu_predictor' + cpu_params['num_parallel_tree'] = 1 + if GPU_exists: + param['nthread'] = -1 + param['tree_method'] = 'gpu_hist' + param['grow_policy'] = 'depthwise' + param['max_depth'] = max_depth + param['max_leaves'] = 0 + param['verbosity'] = 0 + param['gpu_id'] = 0 + param['updater'] = 'grow_gpu_hist' #'prune' + param['predictor'] = 'gpu_predictor' + param['num_parallel_tree'] = 1 + print(' Running XGBoost using GPU parameters') + else: + param = copy.deepcopy(cpu_params) + print(' Running XGBoost using CPU parameters') + ############################################################################### + if isinstance(target, str): + target = [target] + multi_label = False + else: + if len(target) <= 1: + multi_label = False + else: + multi_label = True + ###### Now we detect the various types of variables to see how to convert them to numeric + features_dict = classify_features(train, target) + cols_to_remove = features_dict['cols_delete'] + features_dict['IDcols'] + features_dict['discrete_string_vars']+features_dict['date_vars'] + preds = [x for x in list(train) if x not in target+cols_to_remove] + numvars = train[preds].select_dtypes(include = 'number').columns.tolist() + catvars = left_subtract(preds, numvars) + rem_vars = copy.deepcopy(catvars) + ########## Now we need to select the right model to run repeatedly #### + if target is None or len(target) == 0: + cols_list = list(train) + modeltype = 'Clustering' + else: + modeltype = analyze_problem_type(train, target) + cols_list = left_subtract(list(train),target) + ###################### I M P O R T A N T ############################################## + ###### This top_num decides how many top_n features XGB selects in each iteration. + #### There a total of 5 iterations. Hence 5x10 means maximum 50 features will be selected. + ##### If there are more than 50 variables, then maximum 25% of its variables will be selected + if len(preds) <= 50: + top_num = 10 + else: + ### the maximum number of variables will 25% of preds which means we divide by 5 and get 5% here + ### The five iterations result in 10% being chosen in each iteration. Hence max 50% of variables! 
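+    ### Worked example (numbers purely illustrative): with 200 predictors, top_num becomes
+    ### int(200 * 0.10) = 20; since iter_limit further below is roughly one fifth of the
+    ### columns, about five XGBoost rounds each contribute up to 20 features, i.e. at most
+    ### around 100 candidates before duplicates are dropped.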
+ top_num = int(len(preds)*0.10) + ###################### I M P O R T A N T ############################################## + important_cats = copy.deepcopy(catvars) + ######## Drop Missing value rows since XGB for some reason ######### + ######## can't handle missing values in early stopping rounds ####### + train.dropna(axis=0,subset=preds+target,inplace=True) + if len(numvars) > 1: + final_list = remove_variables_using_fast_correlation(train,numvars,modeltype,target, + corr_limit,verbose) + else: + final_list = copy.deepcopy(numvars) + ####### This is where you draw how featurewiz works when the verbose = 2 ########### + print(' Adding %s categorical variables to reduced numeric variables of %d' %( + len(important_cats),len(final_list))) + if isinstance(final_list,np.ndarray): + final_list = final_list.tolist() + preds = final_list+important_cats + #######You must convert category variables into integers ############### + if len(important_cats) > 0: + train, _ = convert_all_object_columns_to_numeric(train, "") + ######## Dont move this train and y definition anywhere else ######## + y = train[target] + print('############## F E A T U R E S E L E C T I O N ####################') + important_features = [] + ########## This is for Single_Label problems ###################### + if modeltype == 'Regression': + objective = 'reg:squarederror' + model_xgb = XGBRegressor( n_estimators=100,subsample=subsample,objective=objective, + colsample_bytree=col_sub_sample,reg_alpha=0.5, reg_lambda=0.5, + seed=1,n_jobs=-1,random_state=1) + eval_metric = 'rmse' + else: + #### This is for Classifiers only + classes = np.unique(train[target].values) + if len(classes) == 2: + model_xgb = XGBClassifier(base_score=0.5, booster='gbtree', subsample=subsample, + colsample_bytree=col_sub_sample,gamma=1, learning_rate=0.1, max_delta_step=0, + max_depth=max_depth, min_child_weight=1, missing=-999, n_estimators=100, + n_jobs=-1, nthread=None, objective='binary:logistic', + random_state=1, reg_alpha=0.5, reg_lambda=0.5, + seed=1) + eval_metric = 'logloss' + else: + model_xgb = XGBClassifier(base_score=0.5, booster='gbtree', subsample=subsample, + colsample_bytree=col_sub_sample, gamma=1, learning_rate=0.1, max_delta_step=0, + max_depth=max_depth, min_child_weight=1, missing=-999, n_estimators=100, + n_jobs=-1, nthread=None, objective='multi:softmax', + random_state=1, reg_alpha=0.5, reg_lambda=0.5, + seed=1) + eval_metric = 'mlogloss' + #### Now set the parameters for XGBoost ################### + model_xgb.set_params(**param) + #print('Model parameters: %s' %model_xgb) + if multi_label: + ########## This is for multi_label problems ############################### + if modeltype == 'Regression': + model_xgb = MultiOutputRegressor(model_xgb) + #model_xgb = RegressorChain(model_xgb) + else: + ## just do randomized search CV - no need to do one vs rest unless multi-class + model_xgb = MultiOutputClassifier(model_xgb) + #model_xgb = ClassifierChain(model_xgb) + #### This is where you start to Iterate on Finding Important Features ################ + save_xgb = copy.deepcopy(model_xgb) + train_p = train[preds] + if train_p.shape[1] < 10: + iter_limit = 2 + else: + iter_limit = int(train_p.shape[1]/5+0.5) + print('Current number of predictors = %d ' %(train_p.shape[1],)) + print(' Finding Important Features using Boosted Trees algorithm...') + ######## This is where we start training the XGBoost model to find top features #### + try: + for i in range(0,train_p.shape[1],iter_limit): + new_xgb = copy.deepcopy(save_xgb) + print(' 
using %d variables...' %(train_p.shape[1]-i)) + imp_feats = [] + if train_p.shape[1]-i < iter_limit: + X = train_p.iloc[:,i:] + cols_sel = X.columns.tolist() + if modeltype == 'Regression': + train_part = int((1-test_size)*X.shape[0]) + X_train, X_cv, y_train, y_cv = X[:train_part],X[train_part:],y[:train_part],y[train_part:] + else: + X_train, X_cv, y_train, y_cv = train_test_split(X, y, + test_size=test_size, random_state=seed) + try: + if multi_label: + eval_set = [(X_train.values,y_train.values),(X_cv.values,y_cv.values)] + else: + eval_set = [(X_train,y_train),(X_cv,y_cv)] + if multi_label: + model_xgb.fit(X_train,y_train) + else: + model_xgb.fit(X_train,y_train,early_stopping_rounds=early_stopping,eval_set=eval_set, + eval_metric=eval_metric,verbose=False) + except: + #### On Colab, even though GPU exists, many people don't turn it on. + #### In that case, XGBoost blows up when gpu_predictor is used. + #### This is to turn it back to cpu_predictor in case GPU errors! + if GPU_exists: + print('Error: GPU exists but it is not turned on. Using CPU for predictions...') + if multi_label: + new_xgb.estimator.set_params(**cpu_params) + new_xgb.fit(X_train,y_train) + else: + new_xgb.set_params(**cpu_params) + new_xgb.fit(X_train,y_train,early_stopping_rounds=early_stopping,eval_set=eval_set, + eval_metric=eval_metric,verbose=False) + #### This is where you collect the feature importances from each run ############ + if multi_label: + ### doing this for multi-label is a little different for single label ######### + imp_feats = [model_xgb.estimators_[i].feature_importances_ for i in range(len(target))] + imp_feats_df = pd.DataFrame(imp_feats).T + imp_feats_df.columns = target + imp_feats_df.index = cols_sel + imp_feats_df['sum'] = imp_feats_df.sum(axis=1).values + important_features += imp_feats_df.sort_values(by='sum',ascending=False)[:top_num].index.tolist() + else: + ### doing this for single-label is a little different from multi_label ######### + important_features += pd.Series(model_xgb.get_booster().get_score( + importance_type='gain')).sort_values(ascending=False)[:top_num].index.tolist() + ####### order this in the same order in which they were collected ###### + important_features = list(OrderedDict.fromkeys(important_features)) + else: + X = train_p[list(train_p.columns.values)[i:train_p.shape[1]]] + cols_sel = X.columns.tolist() + #### Split here into train and test ##### + if modeltype == 'Regression': + train_part = int((1-test_size)*X.shape[0]) + X_train, X_cv, y_train, y_cv = X[:train_part],X[train_part:],y[:train_part],y[train_part:] + else: + X_train, X_cv, y_train, y_cv = train_test_split(X, y, + test_size=test_size, random_state=seed) + ### set the validation data as arrays in multi-label case ##### + if multi_label: + eval_set = [(X_train.values,y_train.values),(X_cv.values,y_cv.values)] + else: + eval_set = [(X_train,y_train),(X_cv,y_cv)] + ########## Try training the model now ##################### + try: + if multi_label: + model_xgb.fit(X_train,y_train) + else: + model_xgb.fit(X_train,y_train,early_stopping_rounds=early_stopping, + eval_set=eval_set,eval_metric=eval_metric,verbose=False) + except: + #### On Colab, even though GPU exists, many people don't turn it on. + #### In that case, XGBoost blows up when gpu_predictor is used. + #### This is to turn it back to cpu_predictor in case GPU errors! + if GPU_exists: + print('Error: GPU exists but it is not turned on. 
Using CPU for predictions...') + if multi_label: + new_xgb.estimator.set_params(**cpu_params) + new_xgb.fit(X_train,y_train) + else: + new_xgb.set_params(**cpu_params) + new_xgb.fit(X_train,y_train,early_stopping_rounds=early_stopping, + eval_set=eval_set,eval_metric=eval_metric,verbose=False) + ### doing this for multi-label is a little different for single label ######### + if multi_label: + imp_feats = [model_xgb.estimators_[i].feature_importances_ for i in range(len(target))] + imp_feats_df = pd.DataFrame(imp_feats).T + imp_feats_df.columns = target + imp_feats_df.index = cols_sel + imp_feats_df['sum'] = imp_feats_df.sum(axis=1).values + important_features += imp_feats_df.sort_values(by='sum',ascending=False)[:top_num].index.tolist() + else: + important_features += pd.Series(model_xgb.get_booster().get_score( + importance_type='gain')).sort_values(ascending=False)[:top_num].index.tolist() + important_features = list(OrderedDict.fromkeys(important_features)) + except: + print('Finding top features using XGB is crashing. Continuing with all predictors...') + important_features = copy.deepcopy(preds) + return important_features + important_features = list(OrderedDict.fromkeys(important_features)) + print('Found %d important features' %len(important_features)) + print(' Time taken (in seconds) = %0.0f' %(time.time()-start_time)) + numvars = [x for x in numvars if x in important_features] + important_cats = [x for x in important_cats if x in important_features] + return important_features +################################################################################ +def remove_highly_correlated_vars_fast(df, corr_limit=0.70): + """ + This is a simple method to remove highly correlated features fast using Pearson's Correlation. + Use this only for float and integer variables. It will automatically select those only. 
+ It can be used for very large data sets where featurewiz has trouble with memory + """ + # Creating correlation matrix + correlation_dataframe = df.corr().abs().astype(np.float16) + # Selecting upper triangle of correlation matrix + upper_tri = correlation_dataframe.where(np.triu(np.ones(correlation_dataframe.shape), + k=1).astype(np.bool)) + # Finding index of feature columns with correlation greater than 0.95 + to_drop = [column for column in upper_tri.columns if any(upper_tri[column] > corr_limit)] + print(); + print('Highly correlated columns to remove: %s' %to_drop) + return to_drop +##################################################################################### +import os +def check_if_GPU_exists(): + GPU_exists = False + try: + from tensorflow.python.client import device_lib + dev_list = device_lib.list_local_devices() + print('Number of GPUs = %d' %len(dev_list)) + for i in range(len(dev_list)): + if 'GPU' == dev_list[i].device_type: + GPU_exists = True + print('%s available' %dev_list[i].device_type) + except: + print('') + if not GPU_exists: + try: + os.environ['NVIDIA_VISIBLE_DEVICES'] + print(' GPU active on this device') + return True + except: + print(' No GPU active on this device') + return False + else: + return True +############################################################################################# +from itertools import combinations +import matplotlib.patches as mpatches +import matplotlib.pyplot as plt +from sklearn.feature_selection import chi2, mutual_info_regression, mutual_info_classif +from sklearn.feature_selection import SelectKBest +###################################################################################### +# Removes duplicates from a list to return unique values - USED ONLYONCE +def find_remove_duplicates(values): + output = [] + seen = set() + for value in values: + if value not in seen: + output.append(value) + seen.add(value) + return output +################################################################################ +def add_date_time_features(smalldf, startTime, endTime, splitter_date_string="/",splitter_hour_string=":"): + """ + If you have start date time stamp and end date time stamp, this module will create additional features for such fields. + You must provide a start date time stamp field and if you have an end date time stamp field, you must use it. + Otherwise, you are better off using the create_date_time_features module which is also in this library. + You must provide the following: + smalldf: Dataframe containing your date time fields + startTime: this is hopefully a string field which converts to a date time stamp easily. Make sure it is a string. + endTime: this also must be a string field which converts to a date time stamp easily. Make sure it is a string. + splitter_date_string: usually there is a string such as '/' or '.' between day/month/year etc. Default is assumed / here. + splitter_hour_string: usually there is a string such as ':' or '.' between hour:min:sec etc. Default is assumed : here. + """ + smalldf = smalldf.copy() + add_cols = [] + start_date = 'processing'+startTime+'_start_date' + smalldf[start_date] = smalldf[startTime].map(lambda x: x.split(" ")[0]) + add_cols.append(start_date) + try: + start_time = 'processing'+startTime+'_start_time' + smalldf[start_time] = smalldf[startTime].map(lambda x: x.split(" ")[1]) + add_cols.append(start_time) + except: + ### there is no hour-minutes part of this date time stamp field. 
You can just skip it if it is not there + pass + end_date = 'processing'+endTime+'_end_date' + smalldf[end_date] = smalldf[endTime].map(lambda x: x.split(" ")[0]) + add_cols.append(end_date) + try: + end_time = 'processing'+endTime+'_end_time' + smalldf[end_time] = smalldf[endTime].map(lambda x: x.split(" ")[1]) + add_cols.append(end_time) + except: + ### there is no hour-minutes part of this date time stamp field. You can just skip it if it is not there + pass + view_days = 'processing'+startTime+'_elapsed_days' + smalldf[view_days] = (pd.to_datetime(smalldf[end_date]) - pd.to_datetime(smalldf[start_date])).values.astype(int) + add_cols.append(view_days) + try: + view_time = 'processing'+startTime+'_elapsed_time' + smalldf[view_time] = (pd.to_datetime(smalldf[end_time]) - pd.to_datetime(smalldf[start_time])).astype('timedelta64[s]').values + add_cols.append(view_time) + except: + ### In some date time fields this gives an error so skip it in that case + pass + #### The reason we chose endTime here is that startTime is usually taken care of by another library. So better to do this alone. + year = 'processing'+endTime+'_end_year' + smalldf[year] = smalldf[end_date].map(lambda x: str(x).split(splitter_date_string)[0]).values + add_cols.append(year) + #### The reason we chose endTime here is that startTime is usually taken care of by another library. So better to do this alone. + month = 'processing'+endTime+'_end_month' + smalldf[month] = smalldf[end_date].map(lambda x: str(x).split(splitter_date_string)[1]).values + add_cols.append(month) + try: + #### The reason we chose endTime here is that startTime is usually taken care of by another library. So better to do this alone. + daynum = 'processing'+endTime+'_end_day_number' + smalldf[daynum] = smalldf[end_date].map(lambda x: str(x).split(splitter_date_string)[2]).values + add_cols.append(daynum) + except: + ### In some date time fields the day number is not there. If not, just skip it #### + pass + #### In some date time fields, the hour and minute is not there, so skip it in that case if it errors! + try: + start_hour = 'processing'+startTime+'_start_hour' + smalldf[start_hour] = smalldf[start_time].map(lambda x: str(x).split(splitter_hour_string)[0]).values + add_cols.append(start_hour) + start_min = 'processing'+startTime+'_start_hour' + smalldf[start_min] = smalldf[start_time].map(lambda x: str(x).split(splitter_hour_string)[1]).values + add_cols.append(start_min) + except: + ### If it errors, skip it + pass + #### Check if there is a weekday and weekends in date time columns using endTime only + weekday_num = 'processing'+endTime+'_end_weekday_number' + smalldf[weekday_num] = pd.to_datetime(smalldf[end_date]).dt.weekday.values + add_cols.append(weekday_num) + weekend = 'processing'+endTime+'_end_weekend_flag' + smalldf[weekend] = smalldf[weekday_num].map(lambda x: 1 if x in[5,6] else 0) + add_cols.append(weekend) + #### If everything works well, there should be 13 new columns added by module. All the best! + print('%d columns added using start date=%s and end date=%s processing...' %(len(add_cols),startTime,endTime)) + return smalldf +########################################################################### +def split_one_field_into_many(df, field, splitter, filler, new_names_list, add_count_field=False): + """ + This little function takes any data frame field (string variables only) and splits + it into as many fields as you want in the new_names_list. + You can also specify what string to split on using the splitter argument. 
+def split_one_field_into_many(df, field, splitter, filler, new_names_list, add_count_field=False):
+    """
+    This little function takes any dataframe field (string variables only) and splits
+    it into as many fields as you want in the new_names_list.
+    You can specify what string to split on using the splitter argument.
+    You can also fill Null values that occur due to the splitting by specifying a filler.
+    If no new_names_list is given, the new column names are derived from the field name itself.
+    add_count_field: False (default). If True, it will count the number of items in
+        the "field" column before the split. This may be needed in nested dictionary fields.
+    """
+    import warnings
+    warnings.filterwarnings("ignore")
+    df = df.copy()
+    ### First find the maximum number of items in that field ###
+    max_things = df[field].map(lambda x: len(x.split(splitter))).max()
+    if len(new_names_list) == 0:
+        print('    Max. columns created by splitting %s field is %d.' %(
+                            field,max_things))
+    else:
+        if not max_things == len(new_names_list):
+            print('    Max. columns created by splitting %s field is %d but you have given %d variable names only. Selecting first %d' %(
+                        field,max_things,len(new_names_list),len(new_names_list)))
+    ### This creates a new field that counts the number of items in the original field ###
+    if add_count_field:
+        num_products_viewed = 'count_things_in_'+field
+        df[num_products_viewed] = df[field].map(lambda x: len(x.split(splitter))).values
+    ### Pad the field so that every row has the same number of splitter characters ###
+    df[field] = df[field].map(lambda x: x+splitter*(max_things-len(x.split(splitter))) if len(x.split(splitter)) < max_things else x)
+    ###### Now create the new fields by splitting the one large field ########
+    if len(new_names_list) == 0:
+        new_names_list = [field+'_'+str(i) for i in range(1,max_things+1)]
+    try:
+        for i in range(len(new_names_list)):
+            df[field].fillna(filler, inplace=True)
+            df.loc[df[field] == splitter, field] = filler
+            df[new_names_list[i]] = df[field].map(lambda x: x.split(splitter)[i]
+                                          if splitter in x else x)
+    except Exception:
+        ### Check whether the column is a string column. If not, print an error message. ###
+        print('Cannot split the column. Getting an error. Check the column again')
+        return df, new_names_list
+    return df, new_names_list
+###########################################################################
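+#### A minimal usage sketch for split_one_field_into_many (illustrative only, not part
+#### of the library). The toy dataframe and its 'tags' column are assumptions.
+#   import pandas as pd
+#   toy = pd.DataFrame({'tags': ['red;large', 'blue;small;sale', 'green']})
+#   toy2, names = split_one_field_into_many(toy, 'tags', splitter=';', filler='missing',
+#                                           new_names_list=[], add_count_field=True)
+#   # toy2 now has columns tags_1, tags_2, tags_3 plus count_things_in_tags
+###########################################################################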
+def add_aggregate_primitive_features(dft, agg_types, id_column, ignore_variables=[]):
+    """
+    ### Modify the dataframe by adding computational primitive (aggregate) features ####
+    ### What are aggregate primitives? They are features such as "mean", "median", "mode", "min", "max", etc.
+    ### Inputs:
+    ### dft: just send in the dataframe that you want features added to
+    ### agg_types: list of computational types: 'mean','median','count', 'max', 'min', 'sum', etc.
+    ###         One caveat: these agg_types must be valid aggregation functions for a numpy or pandas groupby statement.
+    ###         for example: numpy has 'median','prod','sum','std','var', etc. - they will work!
+    ### id_column: the column to group by when computing the aggregates; it becomes the index of the returned dataframe.
+    ### ignore_variables: list of numeric variables to ignore, since they may be ID variables.
+    """
+    import copy
+    ### Make sure the list of functions they send in are acceptable functions. If not, the aggregation will blow up!
+    func_set = {'count','sum','mean','mad','median','min','max','mode','abs','prod','std','var','sem','skew','kurt','quantile','cumsum','cumprod','cummax','cummin'}
+    agg_types = list(set(agg_types).intersection(func_set))
+    ### If the ignore_variables list is empty, add the id_column to it so it can be dropped from the aggregation.
+    if len(ignore_variables) == 0:
+        ignore_variables = [id_column]
+    ### Select only integer and float variables for this aggregation. Be very careful if there are too many vars:
+    ### this will take time to run in that case.
+    dft_index = copy.deepcopy(dft[id_column])
+    dft_cont = copy.deepcopy(dft.select_dtypes('number').drop(ignore_variables,axis=1,errors='ignore'))
+    dft_cont[id_column] = dft_index
+    try:
+        dft_full = dft_cont.groupby(id_column).agg(agg_types)
+    except Exception:
+        ### If for some reason the groupby blows up, just return the dataframe as is - no changes!
+        return dft
+    cols = [x+'_'+y+'_by_'+id_column for (x,y) in dft_full.columns]
+    dft_full.columns = cols
+    ### Some columns duplicate one another. Keep only one copy of each distinct column.
+    _, list_unique_col_ids = np.unique(dft_full, axis = 1, return_index=True)
+    dft_full = dft_full.iloc[:, list_unique_col_ids]
+    return dft_full
+################################################################################################################################
+import copy
+##############################################################
+def create_ts_features(df, tscol):
+    """
+    This takes as input a dataframe and a date-time column name.
+    It then creates time series features using the pandas .dt.weekday style of syntax.
+    It returns a dataframe of the added features, with each variable as an integer variable.
+    """
+    df = copy.deepcopy(df)
+    dt_adds = []
+    try:
+        df[tscol+'_hour'] = df[tscol].dt.hour.astype(int)
+        df[tscol+'_minute'] = df[tscol].dt.minute.astype(int)
+        dt_adds.append(tscol+'_hour')
+        dt_adds.append(tscol+'_minute')
+    except Exception:
+        print('    Error in creating hour-minute derived features. Continuing...')
+    try:
+        df[tscol+'_dayofweek'] = df[tscol].dt.dayofweek.astype(int)
+        dt_adds.append(tscol+'_dayofweek')
+        df[tscol+'_quarter'] = df[tscol].dt.quarter.astype(int)
+        dt_adds.append(tscol+'_quarter')
+        df[tscol+'_month'] = df[tscol].dt.month.astype(int)
+        dt_adds.append(tscol+'_month')
+        df[tscol+'_year'] = df[tscol].dt.year.astype(int)
+        dt_adds.append(tscol+'_year')
+        today = date.today()
+        df[tscol+'_age_in_years'] = today.year - df[tscol].dt.year.astype(int)
+        dt_adds.append(tscol+'_age_in_years')
+        df[tscol+'_dayofyear'] = df[tscol].dt.dayofyear.astype(int)
+        dt_adds.append(tscol+'_dayofyear')
+        df[tscol+'_dayofmonth'] = df[tscol].dt.day.astype(int)
+        dt_adds.append(tscol+'_dayofmonth')
+        ### .dt.weekofyear is deprecated in newer pandas versions, so use isocalendar().week here ###
+        df[tscol+'_weekofyear'] = df[tscol].dt.isocalendar().week.astype(int)
+        dt_adds.append(tscol+'_weekofyear')
+        weekends = (df[tscol+'_dayofweek'] == 5) | (df[tscol+'_dayofweek'] == 6)
+        df[tscol+'_weekend'] = 0
+        df.loc[weekends, tscol+'_weekend'] = 1
+        df[tscol+'_weekend'] = df[tscol+'_weekend'].astype(int)
+        dt_adds.append(tscol+'_weekend')
+    except Exception:
+        print('    Error in creating date time derived features. Continuing...')
+    df = df[dt_adds].fillna(0).astype(int)
+    return df
+################################################################
+from dateutil.relativedelta import relativedelta
+from datetime import date
+##### This is a little utility that computes age in years from a date-time stamp ####
+def compute_age(year_string):
+    today = date.today()
+    age = relativedelta(today, year_string)
+    return age.years
+#################################################################
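+#### A minimal usage sketch for add_aggregate_primitive_features defined above
+#### (illustrative only, not part of the library). The toy dataframe and its
+#### 'customer_id' and 'amount' columns are assumptions.
+#   import pandas as pd
+#   toy = pd.DataFrame({'customer_id': [1, 1, 2, 2, 2], 'amount': [10., 20., 5., 5., 15.]})
+#   aggs = add_aggregate_primitive_features(toy, ['mean', 'sum', 'max'], 'customer_id')
+#   # -> one row per customer_id, with columns such as amount_mean_by_customer_id
+#################################################################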
+ """ + dtf = copy.deepcopy(dtf) + #### If for some reason ts_column is just a number, make sure it is a string so it does not blow up and concatenated + if not isinstance(ts_column,str): + ts_column = str(ts_column) + try: + ### In some extreme cases, date time vars are not processed yet and hence we must fill missing values here! + if dtf[ts_column].isnull().sum() > 0: + missing_flag = True + new_missing_col = ts_column + '_Missing_Flag' + dtf[new_missing_col] = 0 + dtf.loc[dtf[ts_column].isnull(),new_missing_col]=1 + dtf[ts_column] = dtf[ts_column].fillna(method='ffill') + if dtf[ts_column].dtype in [np.float64,np.float32,np.float16]: + dtf[ts_column] = dtf[ts_column].astype(int) + ### if we have already found that it was a date time var, then leave it as it is. Thats good enough! + date_items = dtf[ts_column].apply(str).apply(len).values + #### In some extreme cases, + if all(date_items[0] == item for item in date_items): + if date_items[0] == 4: + ### If it is just a year variable alone, you should leave it as just a year! + age_col = ts_column+'_age_in_years' + dtf[age_col] = dtf[ts_column].map(lambda x: pd.to_datetime(x,format='%Y')).apply(compute_age).values + return dtf[[ts_column,age_col]] + else: + ### if it is not a year alone, then convert it into a date time variable + dtf[ts_column] = pd.to_datetime(dtf[ts_column], infer_datetime_format=True) + else: + dtf[ts_column] = pd.to_datetime(dtf[ts_column], infer_datetime_format=True) + dtf = create_ts_features(dtf,ts_column) + except: + print('Error in Processing %s column for date time features. Continuing...' %ts_column) + return dtf +###################################################################################### diff --git a/requirements.txt b/requirements.txt index 776acc1..68e23f7 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,8 +1,8 @@ -ipython -jupyter -xgboost>=1.1.1 -pandas -matplotlib -seaborn -scikit-learn>=0.23.1 +ipython +jupyter +xgboost>=1.1.1 +pandas +matplotlib +seaborn +scikit-learn>=0.23.1 networkx \ No newline at end of file diff --git a/setup.py b/setup.py index c19bf3f..86facd6 100644 --- a/setup.py +++ b/setup.py @@ -1,33 +1,33 @@ -#!/usr/bin/env python - -import setuptools - -with open("README.md", "r", encoding="utf-8") as fh: - long_description = fh.read() - -setuptools.setup( - name="featurewiz", - version="0.0.7", - author="Ram Seshadri", - author_email="rsesha2001@yahoo.com", - description="Select Best Features from your data set - any size - now with XGBoost!", - long_description=long_description, - long_description_content_type="text/markdown", - license='Apache License 2.0', - url="https://github.com/AutoViML/featurewiz", - packages=setuptools.find_packages(exclude=("tests",)), - install_requires=[ - "ipython", - "jupyter", - "xgboost>=1.1.1", - "pandas", - "matplotlib", - "seaborn", - "scikit-learn>=0.23.1", - "networkx", - ], - classifiers=[ - "Programming Language :: Python :: 3", - "Operating System :: OS Independent", - ], -) +#!/usr/bin/env python + +import setuptools + +with open("README.md", "r", encoding="utf-8") as fh: + long_description = fh.read() + +setuptools.setup( + name="featurewiz", + version="0.0.7", + author="Ram Seshadri", + author_email="rsesha2001@yahoo.com", + description="Select Best Features from your data set - any size - now with XGBoost!", + long_description=long_description, + long_description_content_type="text/markdown", + license='Apache License 2.0', + url="https://github.com/AutoViML/featurewiz", + 
diff --git a/requirements.txt b/requirements.txt
index 776acc1..68e23f7 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,8 +1,8 @@
-ipython
-jupyter
-xgboost>=1.1.1
-pandas
-matplotlib
-seaborn
-scikit-learn>=0.23.1
+ipython
+jupyter
+xgboost>=1.1.1
+pandas
+matplotlib
+seaborn
+scikit-learn>=0.23.1
 networkx
\ No newline at end of file
diff --git a/setup.py b/setup.py
index c19bf3f..86facd6 100644
--- a/setup.py
+++ b/setup.py
@@ -1,33 +1,33 @@
-#!/usr/bin/env python
-
-import setuptools
-
-with open("README.md", "r", encoding="utf-8") as fh:
-    long_description = fh.read()
-
-setuptools.setup(
-    name="featurewiz",
-    version="0.0.7",
-    author="Ram Seshadri",
-    author_email="rsesha2001@yahoo.com",
-    description="Select Best Features from your data set - any size - now with XGBoost!",
-    long_description=long_description,
-    long_description_content_type="text/markdown",
-    license='Apache License 2.0',
-    url="https://github.com/AutoViML/featurewiz",
-    packages=setuptools.find_packages(exclude=("tests",)),
-    install_requires=[
-        "ipython",
-        "jupyter",
-        "xgboost>=1.1.1",
-        "pandas",
-        "matplotlib",
-        "seaborn",
-        "scikit-learn>=0.23.1",
-        "networkx",
-    ],
-    classifiers=[
-        "Programming Language :: Python :: 3",
-        "Operating System :: OS Independent",
-    ],
-)
+#!/usr/bin/env python
+
+import setuptools
+
+with open("README.md", "r", encoding="utf-8") as fh:
+    long_description = fh.read()
+
+setuptools.setup(
+    name="featurewiz",
+    version="0.0.7",
+    author="Ram Seshadri",
+    author_email="rsesha2001@yahoo.com",
+    description="Select Best Features from your data set - any size - now with XGBoost!",
+    long_description=long_description,
+    long_description_content_type="text/markdown",
+    license='Apache License 2.0',
+    url="https://github.com/AutoViML/featurewiz",
+    packages=setuptools.find_packages(exclude=("tests",)),
+    install_requires=[
+        "ipython",
+        "jupyter",
+        "xgboost>=1.1.1",
+        "pandas",
+        "matplotlib",
+        "seaborn",
+        "scikit-learn>=0.23.1",
+        "networkx",
+    ],
+    classifiers=[
+        "Programming Language :: Python :: 3",
+        "Operating System :: OS Independent",
+    ],
+)